[clang-tools-extra] [llvm] [clang-doc] Add standalone Markdown parsing library (PR #202991)

Sat Jun 13 13:33:34 PDT 2026

https://github.com/Neil-N4 updated https://github.com/llvm/llvm-project/pull/202991

>From b4623400ed04066b222882d46bbddf1819ffca9e Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 09:51:48 -0400
Subject: [PATCH 01/27] [clang-doc] Add standalone Markdown parsing library

---
 .../clang-doc/support/CMakeLists.txt          |   3 +-
 .../clang-doc/support/Markdown.cpp            | 145 ++++++++++++++++++
 .../clang-doc/support/Markdown.h              |  72 +++++++++
 .../unittests/clang-doc/CMakeLists.txt        |   4 +-
 .../clang-doc/MarkdownParserTest.cpp          |  94 ++++++++++++
 5 files changed, 316 insertions(+), 2 deletions(-)
 create mode 100644 clang-tools-extra/clang-doc/support/Markdown.cpp
 create mode 100644 clang-tools-extra/clang-doc/support/Markdown.h
 create mode 100644 clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp

diff --git a/clang-tools-extra/clang-doc/support/CMakeLists.txt b/clang-tools-extra/clang-doc/support/CMakeLists.txt
index 8ac913ffbe998..acff865190ff9 100644
--- a/clang-tools-extra/clang-doc/support/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/support/CMakeLists.txt
@@ -6,5 +6,6 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_library(clangDocSupport STATIC
   File.cpp
+  Markdown.cpp
   Utils.cpp
-  )
+  )
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
new file mode 100644
index 0000000000000..776150b939d27
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -0,0 +1,145 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+
+#define DEBUG_TYPE "clang-doc-markdown"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+static MDNode makeText(StringRef S) {
+  return {NodeKind::NK_Text, S, {}};
+}
+
+// A line is a table separator if it only contains |, -, :, and spaces,
+// and has at least one -.
+static bool isSepRow(StringRef Line) {
+  return Line.contains('-') &&
+         Line.find_first_not_of("|-: ") == StringRef::npos;
+}
+
+// Returns true if Line begins with a bullet list marker (-, *, or +)
+// followed by a space.
+static bool isListItem(StringRef Line) {
+  return Line.starts_with("- ") || Line.starts_with("* ") ||
+         Line.starts_with("+ ");
+}
+
+static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
+                                      BumpPtrAllocator &Arena) {
+  if (Nodes.empty())
+    return {};
+  MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
+  std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
+  return ArrayRef<MDNode>(Allocated, Nodes.size());
+}
+
+ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
+                               BumpPtrAllocator &Arena) {
+  if (ParagraphText.trim().empty())
+    return {};
+
+  SmallVector<StringRef, 16> Lines;
+  ParagraphText.split(Lines, '\n');
+
+  SmallVector<MDNode> Nodes;
+  size_t I = 0, E = Lines.size();
+
+  while (I < E) {
+    StringRef Line = Lines[I].trim();
+
+    if (Line.empty()) {
+      ++I;
+      continue;
+    }
+
+    // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+    // indented up to 3 spaces, the closing fence must use the same character
+    // and be at least as long as the opening fence, and the closing fence may
+    // only be followed by spaces. Doxygen specifics should be handled on a
+    // case-by-case basis.
+    if (Line.starts_with("```") || Line.starts_with("~~~")) {
+      char Fence = Line[0];
+      StringRef Lang = Line.drop_front(3).trim();
+      SmallVector<MDNode> CodeLines;
+      ++I;
+      while (I < E) {
+        StringRef CodeLine = Lines[I].trim();
+        if (CodeLine.size() >= 3 &&
+            all_of(CodeLine.take_front(3),
+                   [Fence](char C) { return C == Fence; }))
+          break;
+        CodeLines.push_back(makeText(Lines[I]));
+        ++I;
+      }
+      ++I; // skip closing fence
+      MDNode Code;
+      Code.Kind = NodeKind::NK_FencedCode;
+      Code.Content = Lang;
+      Code.Children = allocateNodes(CodeLines, Arena);
+      LDBG() << "emitting NK_FencedCode lang='" << Lang
+             << "' lines=" << CodeLines.size();
+      Nodes.push_back(Code);
+      continue;
+    }
+
+    // Pipe table: current line has | and next line is a separator row.
+    if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+      SmallVector<MDNode> Rows;
+      while (I < E && Lines[I].trim().contains('|')) {
+        Rows.push_back(makeText(Lines[I].trim()));
+        ++I;
+      }
+      MDNode Table;
+      Table.Kind = NodeKind::NK_Table;
+      Table.Content = {};
+      Table.Children = allocateNodes(Rows, Arena);
+      LDBG() << "emitting NK_Table rows=" << Rows.size();
+      Nodes.push_back(Table);
+      continue;
+    }
+
+    // Unordered list item.
+    if (isListItem(Line)) {
+      SmallVector<MDNode> Items;
+      while (I < E) {
+        StringRef L = Lines[I].trim();
+        if (!isListItem(L))
+          break;
+        MDNode Item;
+        Item.Kind = NodeKind::NK_ListItem;
+        Item.Content = L.drop_front(2).trim();
+        Item.Children = {};
+        Items.push_back(Item);
+        ++I;
+      }
+      MDNode List;
+      List.Kind = NodeKind::NK_UnorderedList;
+      List.Content = {};
+      List.Children = allocateNodes(Items, Arena);
+      LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+      Nodes.push_back(List);
+      continue;
+    }
+
+    // Plain text fallback.
+    Nodes.push_back(makeText(Line));
+    ++I;
+  }
+
+  LDBG() << "parseMarkdown done nodes=" << Nodes.size();
+  return allocateNodes(Nodes, Arena);
+}
+
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
new file mode 100644
index 0000000000000..890f764f937b1
--- /dev/null
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines a standalone Markdown parsing library for the LLVM
+/// ecosystem. The parser takes plain text and returns a tree of typed nodes
+/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+///
+/// This is a simple Markdown parser for use inside Clang-Doc's comment
+/// pipeline. You give it a paragraph of text and an arena allocator, and it
+/// gives back a list of typed nodes describing the Markdown structure it found.
+///
+/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
+/// you get back an empty list and can fall back to plain-text output. If it
+/// does, you get a tree of MDNode structs where each node has a kind, optional
+/// content (like the language tag on a code fence), and optional children.
+///
+/// All nodes are allocated in the arena you pass in. You own the arena and are
+/// responsible for keeping it alive as long as you use the nodes.
+///
+/// The parser handles fenced code blocks, pipe tables, and unordered lists.
+/// Anything it does not recognize comes back as a plain text node. It will
+/// never crash on bad input.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+enum class NodeKind {
+  // Block nodes
+  NK_Paragraph,
+  NK_FencedCode,
+  NK_Table,
+  NK_UnorderedList,
+  NK_OrderedList,
+  NK_ListItem,
+  NK_ThematicBreak,
+  // Inline nodes
+  NK_Text,
+  NK_InlineCode,
+  NK_Emphasis,
+  NK_Strong,
+  NK_SoftBreak,
+};
+
+struct MDNode {
+  NodeKind Kind;
+  llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
+  llvm::ArrayRef<MDNode> Children; // arena allocated
+};
+
+/// Parses Markdown from a single comment paragraph's text.
+/// Returns an empty ArrayRef if no Markdown constructs are found,
+/// so generators can fall back to plain-text rendering at zero cost.
+llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
+                                     llvm::BumpPtrAllocator &Arena);
+
+} // namespace clang::doc::markdown
+
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
index 01b34ec9a791e..b74207ac88fa7 100644
--- a/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/unittests/clang-doc/CMakeLists.txt
@@ -26,6 +26,7 @@ add_extra_unittest(ClangDocTests
   ClangDocTest.cpp
   GeneratorTest.cpp
   HTMLGeneratorTest.cpp
+  MarkdownParserTest.cpp
   MDGeneratorTest.cpp
   MergeTest.cpp
   SerializeTest.cpp
@@ -49,5 +50,6 @@ clang_target_link_libraries(ClangDocTests
 target_link_libraries(ClangDocTests
   PRIVATE
   clangDoc
+  clangDocSupport
   LLVMTestingSupport
-  )
+  )
\ No newline at end of file
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
new file mode 100644
index 0000000000000..8df5efc7f1d5f
--- /dev/null
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -0,0 +1,94 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/Support/Allocator.h"
+#include "gtest/gtest.h"
+
+using namespace clang::doc::markdown;
+
+namespace {
+
+TEST(MarkdownParserTest, EmptyInput) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("", Arena);
+  EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, WhitespaceOnlyInput) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("   \n  \n", Arena);
+  EXPECT_TRUE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PlainText) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("hello world", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
+  EXPECT_EQ(Nodes[0].Content, "hello world");
+}
+
+TEST(MarkdownParserTest, FencedCodeBlock) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(Nodes[0].Content, "cpp");
+  ASSERT_EQ(Nodes[0].Children.size(), 1u);
+}
+
+TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(Nodes[0].Content.empty());
+}
+
+TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+  // Unterminated fence should not crash and should produce a code node
+  // with whatever lines were found.
+  EXPECT_FALSE(Nodes.empty());
+}
+
+TEST(MarkdownParserTest, PipeTable) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("a | b\nc | d", Arena);
+  // No separator row so should not be parsed as a table
+  for (const auto &Node : Nodes)
+    EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+}
+
+TEST(MarkdownParserTest, UnorderedList) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
+  ASSERT_EQ(Nodes[0].Children.size(), 3u);
+  EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
+  EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
+  EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+}
+
+TEST(MarkdownParserTest, MixedContent) {
+  llvm::BumpPtrAllocator Arena;
+  auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+  EXPECT_EQ(Nodes.size(), 3u);
+}
+
+} // namespace
\ No newline at end of file

>From f4cb4a28630e0f91289bfd4416c59114c5654ff7 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 11:35:54 -0400
Subject: [PATCH 02/27] [clang-doc] Address review feedback: test fixture, raw
 strings, DEBUG_TYPE, EOF newlines

---
 .../clang-doc/support/Markdown.cpp            |  4 +-
 .../clang-doc/support/Markdown.h              |  2 +-
 .../clang-doc/MarkdownParserTest.cpp          | 97 +++++++++++--------
 3 files changed, 61 insertions(+), 42 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 776150b939d27..9e008abf8b08d 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,7 +12,7 @@
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
 
-#define DEBUG_TYPE "clang-doc-markdown"
+#define DEBUG_TYPE "clang-doc"
 
 using namespace llvm;
 
@@ -142,4 +142,4 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
   return allocateNodes(Nodes, Arena);
 }
 
-} // namespace clang::doc::markdown
\ No newline at end of file
+} // namespace clang::doc::markdown
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 890f764f937b1..09b79cc8f2437 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -69,4 +69,4 @@ llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
 
 } // namespace clang::doc::markdown
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
\ No newline at end of file
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 8df5efc7f1d5f..ff9bad88da136 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -14,80 +14,99 @@ using namespace clang::doc::markdown;
 
 namespace {
 
-TEST(MarkdownParserTest, EmptyInput) {
+struct MarkdownParserTest : public ::testing::Test {
   llvm::BumpPtrAllocator Arena;
+};
+
+TEST_F(MarkdownParserTest, EmptyInput) {
   auto Nodes = parseMarkdown("", Arena);
   EXPECT_TRUE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, WhitespaceOnlyInput) {
-  llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
   auto Nodes = parseMarkdown("   \n  \n", Arena);
   EXPECT_TRUE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, PlainText) {
-  llvm::BumpPtrAllocator Arena;
+TEST_F(MarkdownParserTest, PlainText) {
   auto Nodes = parseMarkdown("hello world", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Text);
-  EXPECT_EQ(Nodes[0].Content, "hello world");
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_Text);
+  EXPECT_EQ(N.Content, "hello world");
 }
 
-TEST(MarkdownParserTest, FencedCodeBlock) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```cpp\nint x = 0;\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlock) {
+  auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;
+````)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(Nodes[0].Content, "cpp");
-  ASSERT_EQ(Nodes[0].Children.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "cpp");
+  ASSERT_EQ(N.Children.size(), 1u);
 }
 
-TEST(MarkdownParserTest, FencedCodeBlockNoLang) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```\nsome code\n```", Arena);
+TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
+  auto Nodes = parseMarkdown(R"(```
+some code
+```)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(Nodes[0].Content.empty());
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Content.empty());
 }
 
-TEST(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("```cpp\nint x = 0;", Arena);
+TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+  auto Nodes = parseMarkdown(R"(```cpp
+int x = 0;)",
+                             Arena);
   // Unterminated fence should not crash and should produce a code node
   // with whatever lines were found.
   EXPECT_FALSE(Nodes.empty());
 }
 
-TEST(MarkdownParserTest, PipeTable) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("| A | B |\n|---|---|\n| 1 | 2 |", Arena);
+TEST_F(MarkdownParserTest, PipeTable) {
+  auto Nodes = parseMarkdown(R"(| A | B |
+|---|---|
+| 1 | 2 |)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
 }
 
-TEST(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("a | b\nc | d", Arena);
-  // No separator row so should not be parsed as a table
+TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
+  auto Nodes = parseMarkdown(R"(a | b
+c | d)",
+                             Arena);
+  // No separator row so should not be parsed as a table.
   for (const auto &Node : Nodes)
     EXPECT_NE(Node.Kind, NodeKind::NK_Table);
 }
 
-TEST(MarkdownParserTest, UnorderedList) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("- foo\n- bar\n- baz", Arena);
+TEST_F(MarkdownParserTest, UnorderedList) {
+  auto Nodes = parseMarkdown(R"(- foo
+- bar
+- baz)",
+                             Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_UnorderedList);
-  ASSERT_EQ(Nodes[0].Children.size(), 3u);
-  EXPECT_EQ(Nodes[0].Children[0].Content, "foo");
-  EXPECT_EQ(Nodes[0].Children[1].Content, "bar");
-  EXPECT_EQ(Nodes[0].Children[2].Content, "baz");
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
+  ASSERT_EQ(N.Children.size(), 3u);
+  EXPECT_EQ(N.Children[0].Content, "foo");
+  EXPECT_EQ(N.Children[1].Content, "bar");
+  EXPECT_EQ(N.Children[2].Content, "baz");
 }
 
-TEST(MarkdownParserTest, MixedContent) {
-  llvm::BumpPtrAllocator Arena;
-  auto Nodes = parseMarkdown("some text\n```\ncode\n```\n- item", Arena);
+TEST_F(MarkdownParserTest, MixedContent) {
+  auto Nodes = parseMarkdown(R"(some text
+```
+code
+````
+- item)",
+                             Arena);
   EXPECT_EQ(Nodes.size(), 3u);
 }
 

>From 3ef8f62edab311caff0907ab2b9a0c3aaeb14353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:45:44 -0400
Subject: [PATCH 03/27] [clang-doc] Add CommonMark spec tests for fenced code
 blocks

---
 .../clang-doc/MarkdownParserTest.cpp          | 112 +++++++++++++++++-
 1 file changed, 108 insertions(+), 4 deletions(-)

diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ff9bad88da136..4ca979c1f1d24 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -39,7 +39,7 @@ TEST_F(MarkdownParserTest, PlainText) {
 TEST_F(MarkdownParserTest, FencedCodeBlock) {
   auto Nodes = parseMarkdown(R"(```cpp
 int x = 0;
-````)",
+````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   const auto &N = Nodes[0];
@@ -51,7 +51,7 @@ int x = 0;
 TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
   auto Nodes = parseMarkdown(R"(```
 some code
-```)",
+```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   const auto &N = Nodes[0];
@@ -102,12 +102,116 @@ TEST_F(MarkdownParserTest, UnorderedList) {
 
 TEST_F(MarkdownParserTest, MixedContent) {
   auto Nodes = parseMarkdown(R"(some text
-```
+```````
 code
-````
+````````
 - item)",
                              Arena);
   EXPECT_EQ(Nodes.size(), 3u);
 }
 
+// CommonMark §4.5 example 120: tilde fences work the same as backtick fences.
+TEST_F(MarkdownParserTest, TildeFence) {
+  auto Nodes = parseMarkdown(R"(~~~
+int x = 0;
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Content.empty());
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 120: tilde fence with a language tag.
+TEST_F(MarkdownParserTest, TildeFenceWithLang) {
+  auto Nodes = parseMarkdown(R"(~~~cpp
+int x = 0;
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "cpp");
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
+TEST_F(MarkdownParserTest, ClosingFenceMustMatchOpeningChar) {
+  auto Nodes = parseMarkdown(R"(```
+aaa
+~~~
+````````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  // ~~~ is content, not a closing fence.
+  ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 130: a code block can be empty.
+TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
+  auto Nodes = parseMarkdown(R"(```
+```````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_TRUE(N.Children.empty());
+}
+
+// CommonMark §4.5 example 129: a code block may contain only blank lines.
+TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
+  auto Nodes = parseMarkdown("```\n\n  \n```", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  ASSERT_EQ(N.Children.size(), 2u);
+}
+
+// CommonMark §4.5 example 142: lang tag is captured from the info string.
+TEST_F(MarkdownParserTest, InfoStringLangTag) {
+  auto Nodes = parseMarkdown(R"(```ruby
+def foo(x)
+  return 3
+end
+``````)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "ruby");
+  ASSERT_EQ(N.Children.size(), 3u);
+}
+
+// CommonMark §4.5 example 146: tilde fence info string may contain backticks.
+TEST_F(MarkdownParserTest, TildeFenceInfoStringWithBackticks) {
+  auto Nodes = parseMarkdown(R"(~~~ aa ``` ~~~
+foo
+~~~)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  EXPECT_EQ(N.Content, "aa ``` ~~~");
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
+// CommonMark §4.5 example 124: closing fence must be at least as long as the
+// opening fence.
+// TODO: our parser currently closes on the first line with 3 matching fence
+// chars regardless of opening fence length. Fix as part of the CommonMark
+// TODO in parseMarkdown().
+TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
+  auto Nodes = parseMarkdown("````\naaa\n```", Arena);
+  // The ``` line should not close the ```` fence per CommonMark, but our
+  // parser currently treats it as a closing fence. This test documents the
+  // current (non-conformant) behavior.
+  ASSERT_EQ(Nodes.size(), 1u);
+  const auto &N = Nodes[0];
+  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  ASSERT_EQ(N.Children.size(), 1u);
+}
+
 } // namespace
\ No newline at end of file

>From ffb56028d83a542a775119a7b0c2f88271b2df84 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Wed, 10 Jun 2026 13:59:52 -0400
Subject: [PATCH 04/27] [clang-doc] Replace flat MDNode with typed node
 hierarchy using LLVM RTTI

---
 .../clang-doc/support/Markdown.cpp            |  84 +++---
 .../clang-doc/support/Markdown.h              | 264 ++++++++++++++++--
 .../clang-doc/MarkdownParserTest.cpp          |  84 +++---
 3 files changed, 312 insertions(+), 120 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9e008abf8b08d..bee15c3e23ec3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -18,8 +18,24 @@ using namespace llvm;
 
 namespace clang::doc::markdown {
 
-static MDNode makeText(StringRef S) {
-  return {NodeKind::NK_Text, S, {}};
+// Allocates a contiguous array of T in the arena and returns an ArrayRef.
+template <typename T>
+static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  if (Vec.empty())
+    return {};
+  T *Allocated = Arena.Allocate<T>(Vec.size());
+  std::uninitialized_copy(Vec.begin(), Vec.end(), Allocated);
+  return ArrayRef<T>(Allocated, Vec.size());
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop.
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+  if (S.empty())
+    return {};
+  char *Buf = Arena.Allocate<char>(S.size());
+  std::copy(S.begin(), S.end(), Buf);
+  return StringRef(Buf, S.size());
 }
 
 // A line is a table separator if it only contains |, -, :, and spaces,
@@ -36,24 +52,15 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
-static ArrayRef<MDNode> allocateNodes(const SmallVectorImpl<MDNode> &Nodes,
-                                      BumpPtrAllocator &Arena) {
-  if (Nodes.empty())
-    return {};
-  MDNode *Allocated = Arena.Allocate<MDNode>(Nodes.size());
-  std::uninitialized_copy(Nodes.begin(), Nodes.end(), Allocated);
-  return ArrayRef<MDNode>(Allocated, Nodes.size());
-}
-
-ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
-                               BumpPtrAllocator &Arena) {
+ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
+                                 BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
     return {};
 
   SmallVector<StringRef, 16> Lines;
   ParagraphText.split(Lines, '\n');
 
-  SmallVector<MDNode> Nodes;
+  SmallVector<MDNode *> Nodes;
   size_t I = 0, E = Lines.size();
 
   while (I < E) {
@@ -71,8 +78,8 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
     // case-by-case basis.
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
       char Fence = Line[0];
-      StringRef Lang = Line.drop_front(3).trim();
-      SmallVector<MDNode> CodeLines;
+      StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+      SmallVector<StringRef> CodeLines;
       ++I;
       while (I < E) {
         StringRef CodeLine = Lines[I].trim();
@@ -80,15 +87,13 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
             all_of(CodeLine.take_front(3),
                    [Fence](char C) { return C == Fence; }))
           break;
-        CodeLines.push_back(makeText(Lines[I]));
+        CodeLines.push_back(internString(Lines[I], Arena));
         ++I;
       }
       ++I; // skip closing fence
-      MDNode Code;
-      Code.Kind = NodeKind::NK_FencedCode;
-      Code.Content = Lang;
-      Code.Children = allocateNodes(CodeLines, Arena);
-      LDBG() << "emitting NK_FencedCode lang='" << Lang
+      auto *Code =
+          new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+      LDBG() << "emitting FencedCodeNode lang='" << Lang
              << "' lines=" << CodeLines.size();
       Nodes.push_back(Code);
       continue;
@@ -96,50 +101,45 @@ ArrayRef<MDNode> parseMarkdown(StringRef ParagraphText,
 
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
-      SmallVector<MDNode> Rows;
+      SmallVector<StringRef> Rows;
       while (I < E && Lines[I].trim().contains('|')) {
-        Rows.push_back(makeText(Lines[I].trim()));
+        Rows.push_back(internString(Lines[I].trim(), Arena));
         ++I;
       }
-      MDNode Table;
-      Table.Kind = NodeKind::NK_Table;
-      Table.Content = {};
-      Table.Children = allocateNodes(Rows, Arena);
-      LDBG() << "emitting NK_Table rows=" << Rows.size();
+      auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+      LDBG() << "emitting TableNode rows=" << Rows.size();
       Nodes.push_back(Table);
       continue;
     }
 
     // Unordered list item.
     if (isListItem(Line)) {
-      SmallVector<MDNode> Items;
+      SmallVector<ListItemNode *> Items;
       while (I < E) {
         StringRef L = Lines[I].trim();
         if (!isListItem(L))
           break;
-        MDNode Item;
-        Item.Kind = NodeKind::NK_ListItem;
-        Item.Content = L.drop_front(2).trim();
-        Item.Children = {};
+        StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+        SmallVector<MDNode *> ItemChildren;
+        ItemChildren.push_back(new (Arena) TextNode(ItemText));
+        auto *Item =
+            new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
         Items.push_back(Item);
         ++I;
       }
-      MDNode List;
-      List.Kind = NodeKind::NK_UnorderedList;
-      List.Content = {};
-      List.Children = allocateNodes(Items, Arena);
-      LDBG() << "emitting NK_UnorderedList items=" << Items.size();
+      auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+      LDBG() << "emitting UnorderedListNode items=" << Items.size();
       Nodes.push_back(List);
       continue;
     }
 
     // Plain text fallback.
-    Nodes.push_back(makeText(Line));
+    Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
     ++I;
   }
 
   LDBG() << "parseMarkdown done nodes=" << Nodes.size();
-  return allocateNodes(Nodes, Arena);
+  return allocateArray(Nodes, Arena);
 }
 
-} // namespace clang::doc::markdown
+} // namespace clang::doc::markdown
\ No newline at end of file
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 09b79cc8f2437..3d457bcddfac6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -7,30 +7,50 @@
 //===----------------------------------------------------------------------===//
 ///
 /// \file
-/// This file defines a standalone Markdown parsing library for the LLVM
-/// ecosystem. The parser takes plain text and returns a tree of typed nodes
-/// with no knowledge of comments, Doxygen, or Clang-Doc internals.
+/// Standalone Markdown parsing library for the LLVM ecosystem.
 ///
-/// This is a simple Markdown parser for use inside Clang-Doc's comment
-/// pipeline. You give it a paragraph of text and an arena allocator, and it
-/// gives back a list of typed nodes describing the Markdown structure it found.
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
 ///
-/// The main entry point is parseMarkdown(). If the text has no Markdown in it,
-/// you get back an empty list and can fall back to plain-text output. If it
-/// does, you get a tree of MDNode structs where each node has a kind, optional
-/// content (like the language tag on a code fence), and optional children.
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
 ///
-/// All nodes are allocated in the arena you pass in. You own the arena and are
-/// responsible for keeping it alive as long as you use the nodes.
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
 ///
-/// The parser handles fenced code blocks, pipe tables, and unordered lists.
-/// Anything it does not recognize comes back as a plain text node. It will
-/// never crash on bad input.
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+///   TextNode       -- plain text run
+///   SoftBreakNode  -- soft line break
+///   HardBreakNode  -- hard line break (trailing spaces or backslash)
+///   InlineCodeNode -- inline code span (`code`)
+///   EmphasisNode   -- emphasis (*text* or _text_)
+///   StrongNode     -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+///   ParagraphNode     -- sequence of inline nodes
+///   HeadingNode       -- ATX heading (# through ######), level 1-6
+///   FencedCodeNode    -- fenced code block (``` or ~~~)
+///   TableNode         -- pipe table (raw row text; TODO: structured cells)
+///   UnorderedListNode -- bullet list (-, *, +)
+///   OrderedListNode   -- numbered list with explicit start number
+///   ListItemNode      -- single item inside a list
+///   BlockQuoteNode    -- block quote (>)
+///   ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
-#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
@@ -38,35 +58,217 @@
 
 namespace clang::doc::markdown {
 
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
+/// cheap range-based checks in classof() implementations.
 enum class NodeKind {
+  // Inline nodes
+  NK_Text,
+  NK_SoftBreak,
+  NK_HardBreak,
+  NK_InlineCode,
+  NK_Emphasis,
+  NK_Strong,
+  NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
   // Block nodes
   NK_Paragraph,
+  NK_Heading,
   NK_FencedCode,
   NK_Table,
   NK_UnorderedList,
   NK_OrderedList,
   NK_ListItem,
+  NK_BlockQuote,
   NK_ThematicBreak,
-  // Inline nodes
-  NK_Text,
-  NK_InlineCode,
-  NK_Emphasis,
-  NK_Strong,
-  NK_SoftBreak,
+  NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
 };
 
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
 struct MDNode {
   NodeKind Kind;
-  llvm::StringRef Content; // lang tag for FencedCode, leaf text for Text
-  llvm::ArrayRef<MDNode> Children; // arena allocated
+  explicit MDNode(NodeKind K) : Kind(K) {}
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run.
+struct TextNode : MDNode {
+  llvm::StringRef Text;
+  explicit TextNode(llvm::StringRef Text)
+      : MDNode(NodeKind::NK_Text), Text(Text) {}
+  static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+  SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_SoftBreak;
+  }
+};
+
+/// Hard line break -- two trailing spaces or a backslash before a newline.
+struct HardBreakNode : MDNode {
+  HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_HardBreak;
+  }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+  llvm::StringRef Code;
+  explicit InlineCodeNode(llvm::StringRef Code)
+      : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_InlineCode;
+  }
+};
+
+/// Emphasized text: *text* or _text_.
+struct EmphasisNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Emphasis;
+  }
 };
 
-/// Parses Markdown from a single comment paragraph's text.
-/// Returns an empty ArrayRef if no Markdown constructs are found,
-/// so generators can fall back to plain-text rendering at zero cost.
-llvm::ArrayRef<MDNode> parseMarkdown(llvm::StringRef ParagraphText,
-                                     llvm::BumpPtrAllocator &Arena);
+/// Strongly emphasized text: **text** or __text__.
+struct StrongNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit StrongNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Strong), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Strong;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines.
+struct ParagraphNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Paragraph;
+  }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children to avoid padding between the base class's 4-byte Kind and the
+/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+struct HeadingNode : MDNode {
+  unsigned Level;                    // 1-6
+  llvm::ArrayRef<MDNode *> Children; // inline content
+  HeadingNode(unsigned Level, llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_Heading), Level(Level), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_Heading;
+  }
+};
+
+/// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
+/// "cpp"); empty when no language was specified. Lines contains the raw text
+/// of each interior line, without the opening or closing fence.
+///
+/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// to 3 spaces; the closing fence must use the same character and be at least
+/// as long as the opening fence; only spaces may follow the closing fence.
+struct FencedCodeNode : MDNode {
+  llvm::StringRef Lang;
+  llvm::ArrayRef<llvm::StringRef> Lines;
+  FencedCodeNode(llvm::StringRef Lang, llvm::ArrayRef<llvm::StringRef> Lines)
+      : MDNode(NodeKind::NK_FencedCode), Lang(Lang), Lines(Lines) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_FencedCode;
+  }
+};
+
+/// Pipe table. Rows contains the raw text of each row line including the
+/// header and separator rows.
+/// TODO: replace with a structured header/body/cell representation.
+struct TableNode : MDNode {
+  llvm::ArrayRef<llvm::StringRef> Rows;
+  explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
+      : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+  static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
+};
+
+/// A single list item. Children may contain block-level nodes for loose
+/// lists, or a single inline sequence for tight lists.
+struct ListItemNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_ListItem), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_ListItem;
+  }
+};
+
+/// Unordered (bullet) list. Markers are -, *, or +.
+struct UnorderedListNode : MDNode {
+  llvm::ArrayRef<ListItemNode *> Items;
+  explicit UnorderedListNode(llvm::ArrayRef<ListItemNode *> Items)
+      : MDNode(NodeKind::NK_UnorderedList), Items(Items) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_UnorderedList;
+  }
+};
+
+/// Ordered (numbered) list. Start is the number on the first item. Start is
+/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+struct OrderedListNode : MDNode {
+  unsigned Start;
+  llvm::ArrayRef<ListItemNode *> Items;
+  OrderedListNode(unsigned Start, llvm::ArrayRef<ListItemNode *> Items)
+      : MDNode(NodeKind::NK_OrderedList), Start(Start), Items(Items) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_OrderedList;
+  }
+};
+
+/// Block quote (> ...). Children are block-level nodes inside the quote.
+struct BlockQuoteNode : MDNode {
+  llvm::ArrayRef<MDNode *> Children;
+  explicit BlockQuoteNode(llvm::ArrayRef<MDNode *> Children)
+      : MDNode(NodeKind::NK_BlockQuote), Children(Children) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_BlockQuote;
+  }
+};
+
+/// Thematic break: a line of three or more ---, ***, or ___ characters.
+struct ThematicBreakNode : MDNode {
+  ThematicBreakNode() : MDNode(NodeKind::NK_ThematicBreak) {}
+  static bool classof(const MDNode *N) {
+    return N->Kind == NodeKind::NK_ThematicBreak;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Parser entry point
+//===----------------------------------------------------------------------===//
+
+/// Parse Markdown from a single paragraph of plain text. Returns a list of
+/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
+/// Markdown constructs are found, letting callers fall back to plain-text
+/// rendering at zero cost. The parser never crashes on malformed input.
+///
+/// The caller must keep Arena alive for the lifetime of any returned nodes.
+llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
+                                       llvm::BumpPtrAllocator &Arena);
 
 } // namespace clang::doc::markdown
 
-#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MARKDOWN_H
+#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 4ca979c1f1d24..b61094f034375 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -8,9 +8,11 @@
 
 #include "support/Markdown.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include "gtest/gtest.h"
 
 using namespace clang::doc::markdown;
+using namespace llvm;
 
 namespace {
 
@@ -31,9 +33,8 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
 TEST_F(MarkdownParserTest, PlainText) {
   auto Nodes = parseMarkdown("hello world", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_Text);
-  EXPECT_EQ(N.Content, "hello world");
+  auto *N = cast<TextNode>(Nodes[0]);
+  EXPECT_EQ(N->Text, "hello world");
 }
 
 TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -42,10 +43,9 @@ int x = 0;
 ````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "cpp");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "cpp");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 TEST_F(MarkdownParserTest, FencedCodeBlockNoLang) {
@@ -54,9 +54,8 @@ some code
 ```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Content.empty());
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lang.empty());
 }
 
 TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
@@ -74,7 +73,7 @@ TEST_F(MarkdownParserTest, PipeTable) {
 | 1 | 2 |)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(Nodes[0].Kind, NodeKind::NK_Table);
+  EXPECT_TRUE(isa<TableNode>(Nodes[0]));
 }
 
 TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
@@ -82,8 +81,8 @@ TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {
 c | d)",
                              Arena);
   // No separator row so should not be parsed as a table.
-  for (const auto &Node : Nodes)
-    EXPECT_NE(Node.Kind, NodeKind::NK_Table);
+  for (const auto *Node : Nodes)
+    EXPECT_FALSE(isa<TableNode>(Node));
 }
 
 TEST_F(MarkdownParserTest, UnorderedList) {
@@ -92,12 +91,11 @@ TEST_F(MarkdownParserTest, UnorderedList) {
 - baz)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_UnorderedList);
-  ASSERT_EQ(N.Children.size(), 3u);
-  EXPECT_EQ(N.Children[0].Content, "foo");
-  EXPECT_EQ(N.Children[1].Content, "bar");
-  EXPECT_EQ(N.Children[2].Content, "baz");
+  auto *N = cast<UnorderedListNode>(Nodes[0]);
+  ASSERT_EQ(N->Items.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+  EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
+  EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
 }
 
 TEST_F(MarkdownParserTest, MixedContent) {
@@ -117,10 +115,9 @@ int x = 0;
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Content.empty());
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lang.empty());
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 120: tilde fence with a language tag.
@@ -130,10 +127,9 @@ int x = 0;
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "cpp");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "cpp");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 122: a tilde line does not close a backtick fence.
@@ -144,10 +140,9 @@ aaa
 ````````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
   // ~~~ is content, not a closing fence.
-  ASSERT_EQ(N.Children.size(), 2u);
+  ASSERT_EQ(N->Lines.size(), 2u);
 }
 
 // CommonMark §4.5 example 130: a code block can be empty.
@@ -156,18 +151,16 @@ TEST_F(MarkdownParserTest, EmptyFencedCodeBlock) {
 ```````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_TRUE(N.Children.empty());
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_TRUE(N->Lines.empty());
 }
 
 // CommonMark §4.5 example 129: a code block may contain only blank lines.
 TEST_F(MarkdownParserTest, FencedCodeBlockBlankLineContent) {
   auto Nodes = parseMarkdown("```\n\n  \n```", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  ASSERT_EQ(N.Children.size(), 2u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  ASSERT_EQ(N->Lines.size(), 2u);
 }
 
 // CommonMark §4.5 example 142: lang tag is captured from the info string.
@@ -179,10 +172,9 @@ end
 ``````)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "ruby");
-  ASSERT_EQ(N.Children.size(), 3u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "ruby");
+  ASSERT_EQ(N->Lines.size(), 3u);
 }
 
 // CommonMark §4.5 example 146: tilde fence info string may contain backticks.
@@ -192,10 +184,9 @@ foo
 ~~~)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  EXPECT_EQ(N.Content, "aa ``` ~~~");
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "aa ``` ~~~");
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 // CommonMark §4.5 example 124: closing fence must be at least as long as the
@@ -209,9 +200,8 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
   // parser currently treats it as a closing fence. This test documents the
   // current (non-conformant) behavior.
   ASSERT_EQ(Nodes.size(), 1u);
-  const auto &N = Nodes[0];
-  EXPECT_EQ(N.Kind, NodeKind::NK_FencedCode);
-  ASSERT_EQ(N.Children.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  ASSERT_EQ(N->Lines.size(), 1u);
 }
 
 } // namespace
\ No newline at end of file

>From 25fe7daff183f51a8b31ed0d8481b9a2f1fbdbd8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:33:44 -0400
Subject: [PATCH 05/27] [clang-doc] Introduce LineReader cursor for the
 Markdown parse loop

Replace the raw size_t I = 0, E = Lines.size() index arithmetic in
parseMarkdown() with a LineReader cursor that encapsulates the position
and exposes peek(), peek(Offset), advance(), and atEnd(). The parse
logic and emitted nodes are unchanged; this only removes manual index
bookkeeping. All 18 MarkdownParserTest cases still pass.

Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
 .../clang-doc/support/Markdown.cpp            | 73 ++++++++++++++-----
 1 file changed, 54 insertions(+), 19 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index bee15c3e23ec3..f171457e73046 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
+#include <cassert>
 
 #define DEBUG_TYPE "clang-doc"
 
@@ -52,6 +53,42 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // True once every line has been consumed.
+  bool atEnd() const { return Pos >= Lines.size(); }
+
+  // The current line, untrimmed. Must not be called when atEnd().
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // The line Offset positions ahead of the cursor, or an empty StringRef when
+  // that position is past the end. peek(0) is the current line.
+  StringRef peek(size_t Offset) const {
+    size_t Target = Pos + Offset;
+    return Target < Lines.size() ? Lines[Target] : StringRef();
+  }
+
+  // Consume the current line and return it, untrimmed. Must not be called when
+  // atEnd().
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    return Lines[Pos++];
+  }
+
+private:
+  ArrayRef<StringRef> Lines;
+  size_t Pos = 0;
+};
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -61,13 +98,13 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
   ParagraphText.split(Lines, '\n');
 
   SmallVector<MDNode *> Nodes;
-  size_t I = 0, E = Lines.size();
+  LineReader Reader(Lines);
 
-  while (I < E) {
-    StringRef Line = Lines[I].trim();
+  while (!Reader.atEnd()) {
+    StringRef Line = Reader.peek().trim();
 
     if (Line.empty()) {
-      ++I;
+      Reader.advance();
       continue;
     }
 
@@ -79,18 +116,18 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
       char Fence = Line[0];
       StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+      Reader.advance(); // consume opening fence
       SmallVector<StringRef> CodeLines;
-      ++I;
-      while (I < E) {
-        StringRef CodeLine = Lines[I].trim();
+      while (!Reader.atEnd()) {
+        StringRef CodeLine = Reader.peek().trim();
         if (CodeLine.size() >= 3 &&
             all_of(CodeLine.take_front(3),
                    [Fence](char C) { return C == Fence; }))
           break;
-        CodeLines.push_back(internString(Lines[I], Arena));
-        ++I;
+        CodeLines.push_back(internString(Reader.advance(), Arena));
       }
-      ++I; // skip closing fence
+      if (!Reader.atEnd())
+        Reader.advance(); // consume closing fence
       auto *Code =
           new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
       LDBG() << "emitting FencedCodeNode lang='" << Lang
@@ -100,12 +137,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     }
 
     // Pipe table: current line has | and next line is a separator row.
-    if (Line.contains('|') && I + 1 < E && isSepRow(Lines[I + 1].trim())) {
+    if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       SmallVector<StringRef> Rows;
-      while (I < E && Lines[I].trim().contains('|')) {
-        Rows.push_back(internString(Lines[I].trim(), Arena));
-        ++I;
-      }
+      while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+        Rows.push_back(internString(Reader.advance().trim(), Arena));
       auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
       LDBG() << "emitting TableNode rows=" << Rows.size();
       Nodes.push_back(Table);
@@ -115,8 +150,8 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     // Unordered list item.
     if (isListItem(Line)) {
       SmallVector<ListItemNode *> Items;
-      while (I < E) {
-        StringRef L = Lines[I].trim();
+      while (!Reader.atEnd()) {
+        StringRef L = Reader.peek().trim();
         if (!isListItem(L))
           break;
         StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
@@ -125,7 +160,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
         auto *Item =
             new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
         Items.push_back(Item);
-        ++I;
+        Reader.advance();
       }
       auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
       LDBG() << "emitting UnorderedListNode items=" << Items.size();
@@ -135,7 +170,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
 
     // Plain text fallback.
     Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
-    ++I;
+    Reader.advance();
   }
 
   LDBG() << "parseMarkdown done nodes=" << Nodes.size();

>From 060bf63fe9f19fa45ef941f10594897351591d56 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 02:44:27 -0400
Subject: [PATCH 06/27] [clang-doc] Parse inline emphasis, strong, and code in
 Markdown
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an inline pass over paragraph text that recognizes emphasis
(*text* or _text_), strong (**text** or __text__), and inline code
(`code`), emitting the EmphasisNode, StrongNode, and InlineCodeNode
types already in the hierarchy. Emphasis and strong recurse into their
content, and runs that match no construct stay plain TextNodes.

Delimiter matching uses a simplified subset of the CommonMark §6
flanking rules: a delimiter opens only with non-whitespace inside it and
closes only with non-whitespace before it, and code spans close on a
backtick run of equal length. The full delimiter-stack model is left as
a TODO. Adds 12 unit tests covering each construct plus the unmatched
and unterminated cases.

Co-Authored-By: Claude Opus 4.8 <noreply at anthropic.com>
---
 .../clang-doc/support/Markdown.cpp            | 121 +++++++++++++++++-
 .../clang-doc/MarkdownParserTest.cpp          |  97 ++++++++++++++
 2 files changed, 216 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f171457e73046..f1af4f5430772 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -8,6 +8,7 @@
 
 #include "Markdown.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
@@ -89,6 +90,121 @@ class LineReader {
   size_t Pos = 0;
 };
 
+// Returns the number of consecutive copies of C starting at S[Start].
+static size_t countRun(StringRef S, size_t Start, char C) {
+  size_t I = Start;
+  while (I < S.size() && S[I] == C)
+    ++I;
+  return I - Start;
+}
+
+// Strips one leading and one trailing space from a code span's content when
+// both are present and the content is not all spaces, per CommonMark §6.1.
+static StringRef trimCodeSpan(StringRef Code) {
+  if (Code.size() >= 2 && Code.front() == ' ' && Code.back() == ' ' &&
+      Code.find_first_not_of(' ') != StringRef::npos)
+    return Code.drop_front().drop_back();
+  return Code;
+}
+
+// Finds the start index of a closing emphasis run of exactly Count copies of C,
+// searching forward from From. Requires non-whitespace immediately inside both
+// the opening and closing delimiters and non-empty content, a simplified take
+// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
+// closing run exists.
+static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+  size_t E = S.size();
+  // Opening delimiter is not left-flanking if whitespace follows it.
+  if (From >= E || isSpace(S[From]))
+    return StringRef::npos;
+  for (size_t J = From; J + Count <= E; ++J) {
+    if (S[J] != C)
+      continue;
+    size_t Run = countRun(S, J, C);
+    if (Run != Count) {
+      J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
+      continue;
+    }
+    // Reject empty content and closing runs that are not right-flanking.
+    if (J == From || isSpace(S[J - 1]))
+      continue;
+    return J;
+  }
+  return StringRef::npos;
+}
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
+// _text_). Runs that match no construct become TextNodes. Emphasis and strong
+// recurse so their content may itself contain inline constructs. Text with no
+// markers yields a single TextNode.
+//
+// TODO: This covers the common cases but not the full CommonMark §6 inline
+// model (delimiter stacks, intraword underscore rules, links, autolinks).
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+  SmallVector<MDNode *> Nodes;
+  size_t TextStart = 0, I = 0, E = S.size();
+
+  auto flushText = [&](size_t End) {
+    if (End > TextStart)
+      Nodes.push_back(new (Arena) TextNode(
+          internString(S.substr(TextStart, End - TextStart), Arena)));
+  };
+
+  while (I < E) {
+    char C = S[I];
+
+    // Inline code span: a run of N backticks closed by a run of N backticks.
+    if (C == '`') {
+      size_t N = countRun(S, I, '`');
+      size_t J = I + N;
+      while (J < E && countRun(S, J, '`') != N)
+        J += S[J] == '`' ? countRun(S, J, '`') : 1;
+      if (J < E) {
+        flushText(I);
+        StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+        Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+        I = J + N;
+        TextStart = I;
+        continue;
+      }
+      // No closing run; leave the backticks as literal text.
+      I += N;
+      continue;
+    }
+
+    // Emphasis (*text*, _text_) and strong (**text**, __text__).
+    if (C == '*' || C == '_') {
+      // Strong binds the two-delimiter form before single-delimiter emphasis.
+      if (I + 1 < E && S[I + 1] == C) {
+        size_t Close = findClosingDelim(S, I + 2, C, 2);
+        if (Close != StringRef::npos) {
+          flushText(I);
+          StringRef Inner = S.substr(I + 2, Close - (I + 2));
+          Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+          I = Close + 2;
+          TextStart = I;
+          continue;
+        }
+      }
+      size_t Close = findClosingDelim(S, I + 1, C, 1);
+      if (Close != StringRef::npos) {
+        flushText(I);
+        StringRef Inner = S.substr(I + 1, Close - (I + 1));
+        Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+        I = Close + 1;
+        TextStart = I;
+        continue;
+      }
+    }
+
+    ++I;
+  }
+
+  flushText(E);
+  return allocateArray(Nodes, Arena);
+}
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -168,8 +284,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
-    // Plain text fallback.
-    Nodes.push_back(new (Arena) TextNode(internString(Line, Arena)));
+    // Plain text, scanned for inline constructs (emphasis, strong, code).
+    for (MDNode *Inline : parseInline(Line, Arena))
+      Nodes.push_back(Inline);
     Reader.advance();
   }
 
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index b61094f034375..ea72dacfb08e5 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -204,4 +204,101 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
   ASSERT_EQ(N->Lines.size(), 1u);
 }
 
+TEST_F(MarkdownParserTest, EmphasisAsterisk) {
+  auto Nodes = parseMarkdown("an *important* word", Arena);
+  ASSERT_EQ(Nodes.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(Nodes[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+}
+
+TEST_F(MarkdownParserTest, EmphasisUnderscore) {
+  auto Nodes = parseMarkdown("_em_", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
+}
+
+TEST_F(MarkdownParserTest, StrongAsterisk) {
+  auto Nodes = parseMarkdown("**bold**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+TEST_F(MarkdownParserTest, StrongUnderscore) {
+  auto Nodes = parseMarkdown("__bold__", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
+}
+
+// Two delimiters must be parsed as strong, not as nested emphasis.
+TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
+  auto Nodes = parseMarkdown("**strong**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, InlineCode) {
+  auto Nodes = parseMarkdown("call `foo()` here", Arena);
+  ASSERT_EQ(Nodes.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
+  EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
+  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+}
+
+// CommonMark §6.1: a doubled backtick fence lets the span contain a single
+// backtick.
+TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
+  auto Nodes = parseMarkdown("``a`b``", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+}
+
+// Emphasis and strong recurse, so a code span inside emphasis is parsed.
+TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
+  auto Nodes = parseMarkdown("*see `x`*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  ASSERT_EQ(Em->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
+  EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
+}
+
+TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
+  auto Nodes = parseMarkdown("**a `b`**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *St = cast<StrongNode>(Nodes[0]);
+  ASSERT_EQ(St->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
+  EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
+}
+
+// A delimiter with whitespace on the inside does not open emphasis.
+TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
+  auto Nodes = parseMarkdown("a * b", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+}
+
+// An unterminated code span leaves the backtick as literal text.
+TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
+  auto Nodes = parseMarkdown("a `b c", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+}
+
+// Inline parsing must not disturb plain text with no markers.
+TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
+  auto Nodes = parseMarkdown("just words", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+}
+
 } // namespace
\ No newline at end of file

>From 0af1c8e2999a20e2044cc337a8c4f0d8112d208b Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 10:18:32 -0400
Subject: [PATCH 07/27] [clang-doc] Address review feedback: rename inline
 parser variables, simplify header docs

---
 .../clang-doc/support/Markdown.cpp            | 54 ++++++++++---------
 .../clang-doc/support/Markdown.h              | 25 +++------
 2 files changed, 34 insertions(+), 45 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index f1af4f5430772..ef29daa76a166 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -143,7 +143,7 @@ static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
 // model (delimiter stacks, intraword underscore rules, links, autolinks).
 static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
   SmallVector<MDNode *> Nodes;
-  size_t TextStart = 0, I = 0, E = S.size();
+  size_t TextStart = 0, Pos = 0, E = S.size();
 
   auto flushText = [&](size_t End) {
     if (End > TextStart)
@@ -151,54 +151,56 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
           internString(S.substr(TextStart, End - TextStart), Arena)));
   };
 
-  while (I < E) {
-    char C = S[I];
+  while (Pos < E) {
+    char C = S[Pos];
 
-    // Inline code span: a run of N backticks closed by a run of N backticks.
+    // Inline code span: an opening backtick run closed by a run of the same
+    // length.
     if (C == '`') {
-      size_t N = countRun(S, I, '`');
-      size_t J = I + N;
-      while (J < E && countRun(S, J, '`') != N)
-        J += S[J] == '`' ? countRun(S, J, '`') : 1;
-      if (J < E) {
-        flushText(I);
-        StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+      size_t OpenLen = countRun(S, Pos, '`');
+      size_t ClosePos = Pos + OpenLen;
+      while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+        ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
+      if (ClosePos < E) {
+        flushText(Pos);
+        StringRef Code =
+            trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
         Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
-        I = J + N;
-        TextStart = I;
+        Pos = ClosePos + OpenLen;
+        TextStart = Pos;
         continue;
       }
       // No closing run; leave the backticks as literal text.
-      I += N;
+      Pos += OpenLen;
       continue;
     }
 
     // Emphasis (*text*, _text_) and strong (**text**, __text__).
     if (C == '*' || C == '_') {
       // Strong binds the two-delimiter form before single-delimiter emphasis.
-      if (I + 1 < E && S[I + 1] == C) {
-        size_t Close = findClosingDelim(S, I + 2, C, 2);
+      if (Pos + 1 < E && S[Pos + 1] == C) {
+        size_t Close = findClosingDelim(S, Pos + 2, C, 2);
         if (Close != StringRef::npos) {
-          flushText(I);
-          StringRef Inner = S.substr(I + 2, Close - (I + 2));
+          flushText(Pos);
+          StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
           Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
-          I = Close + 2;
-          TextStart = I;
+          Pos = Close + 2;
+          TextStart = Pos;
           continue;
         }
       }
-      size_t Close = findClosingDelim(S, I + 1, C, 1);
+      size_t Close = findClosingDelim(S, Pos + 1, C, 1);
       if (Close != StringRef::npos) {
-        flushText(I);
-        StringRef Inner = S.substr(I + 1, Close - (I + 1));
+        flushText(Pos);
+        StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
         Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
-        I = Close + 1;
-        TextStart = I;
+        Pos = Close + 1;
+        TextStart = Pos;
         continue;
       }
     }
 
-    ++I;
+    ++Pos;
   }
 
   flushText(E);
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 3d457bcddfac6..60390465588c3 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -9,20 +9,10 @@
 /// \file
 /// Standalone Markdown parsing library for the LLVM ecosystem.
 ///
-/// The parser takes plain paragraph text and returns a polymorphic tree of
-/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
-/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
-/// type carries exactly the fields it needs -- no overloaded Content field,
-/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
-/// downcasting; each concrete type provides classof() for this purpose.
-///
-/// See
-/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
-///
-/// Field ordering in each derived struct is chosen to minimize padding:
-/// 4-byte fields (like Level or Start) are declared before 16-byte fields
-/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
-/// base class's 4-byte Kind and the first derived field.
+/// The parser takes a single paragraph of plain text and returns a list of
+/// nodes describing the Markdown it found. Each kind of construct has its own
+/// node type, and every node shares a common MDNode base, so you can use
+/// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
 ///
 /// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
 ///   TextNode       -- plain text run
@@ -165,9 +155,7 @@ struct ParagraphNode : MDNode {
   }
 };
 
-/// ATX heading: one to six leading # characters. Level is declared before
-/// Children to avoid padding between the base class's 4-byte Kind and the
-/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
+/// ATX heading: one to six leading # characters.
 struct HeadingNode : MDNode {
   unsigned Level;                    // 1-6
   llvm::ArrayRef<MDNode *> Children; // inline content
@@ -226,8 +214,7 @@ struct UnorderedListNode : MDNode {
   }
 };
 
-/// Ordered (numbered) list. Start is the number on the first item. Start is
-/// declared before Items to avoid padding, keeping sizeof at 24 bytes.
+/// Ordered (numbered) list. Start is the number on the first item.
 struct OrderedListNode : MDNode {
   unsigned Start;
   llvm::ArrayRef<ListItemNode *> Items;

>From b76bfa182db40e7a358ffb7d42506aff24453e14 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 11:50:32 -0400
Subject: [PATCH 08/27] [clang-doc] Add libFuzzer harness for parseMarkdown()

---
 clang-tools-extra/clang-doc/CMakeLists.txt    |  1 +
 .../clang-doc/fuzzer/CMakeLists.txt           | 21 +++++++++++++
 .../clang-doc/fuzzer/DummyMarkdownFuzzer.cpp  | 21 +++++++++++++
 .../clang-doc/fuzzer/FuzzMarkdown.cpp         | 30 +++++++++++++++++++
 4 files changed, 73 insertions(+)
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
 create mode 100644 clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp

diff --git a/clang-tools-extra/clang-doc/CMakeLists.txt b/clang-tools-extra/clang-doc/CMakeLists.txt
index 22e2c8159e9f6..f64d1129ed4af 100644
--- a/clang-tools-extra/clang-doc/CMakeLists.txt
+++ b/clang-tools-extra/clang-doc/CMakeLists.txt
@@ -44,6 +44,7 @@ target_link_libraries(clangDoc
   )
 
 add_subdirectory(tool)
+add_subdirectory(fuzzer)
 
 if (LLVM_INCLUDE_BENCHMARKS)
   add_subdirectory(benchmarks)
diff --git a/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
new file mode 100644
index 0000000000000..5e6e943891052
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/CMakeLists.txt
@@ -0,0 +1,21 @@
+# Resolve "support/Markdown.h" against the parent clang-doc directory.
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..)
+
+set(LLVM_LINK_COMPONENTS
+  FuzzerCLI
+  Support
+  )
+
+# This fuzzer runs on oss-fuzz, so keep it around even if it looks unreferenced.
+# With a fuzzing engine configured (LLVM_USE_SANITIZE_COVERAGE or an external
+# LLVM_LIB_FUZZING_ENGINE) this builds a real fuzz target; otherwise DUMMY_MAIN
+# provides a main() so it still builds and can be replayed over saved inputs.
+add_llvm_fuzzer(clang-doc-markdown-fuzzer
+  FuzzMarkdown.cpp
+  DUMMY_MAIN DummyMarkdownFuzzer.cpp
+  )
+
+target_link_libraries(clang-doc-markdown-fuzzer
+  PRIVATE
+  clangDocSupport
+  )
diff --git a/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
new file mode 100644
index 0000000000000..61466e0fa4ef6
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/DummyMarkdownFuzzer.cpp
@@ -0,0 +1,21 @@
+//===-- DummyMarkdownFuzzer.cpp - Entry point to test the fuzzer ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of main so we can build and test the harness without linking
+// libFuzzer. Each command line argument is treated as a file to run the
+// harness on.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/FuzzMutate/FuzzerCLI.h"
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size);
+
+int main(int argc, char *argv[]) {
+  return llvm::runFuzzerOnInputs(argc, argv, LLVMFuzzerTestOneInput);
+}
diff --git a/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
new file mode 100644
index 0000000000000..e407b3baccf2e
--- /dev/null
+++ b/clang-tools-extra/clang-doc/fuzzer/FuzzMarkdown.cpp
@@ -0,0 +1,30 @@
+//===-- FuzzMarkdown.cpp - Fuzzer for the clang-doc Markdown parser -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements a libFuzzer harness for parseMarkdown(). It feeds
+/// arbitrary bytes to the parser and checks that it never crashes. The parsed
+/// nodes are walked so the returned tree is exercised, not just allocated.
+///
+//===----------------------------------------------------------------------===//
+
+#include "support/Markdown.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include <cstddef>
+#include <cstdint>
+
+using namespace clang::doc::markdown;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  llvm::BumpPtrAllocator Arena;
+  llvm::StringRef Input(reinterpret_cast<const char *>(Data), Size);
+  for (const MDNode *Node : parseMarkdown(Input, Arena))
+    (void)Node->Kind;
+  return 0;
+}

>From 77e28993d7a167410fd1a1ee97d2824945b44063 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 18:47:47 -0400
Subject: [PATCH 09/27] [clang-doc] Address review feedback: rename
 findClosingDelim params, add table TODO, fix EOF newline

---
 .../clang-doc/support/Markdown.cpp            | 28 +++++++++++--------
 .../clang-doc/MarkdownParserTest.cpp          |  2 +-
 2 files changed, 17 insertions(+), 13 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index ef29daa76a166..6a57cd7900ea2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -107,26 +107,27 @@ static StringRef trimCodeSpan(StringRef Code) {
   return Code;
 }
 
-// Finds the start index of a closing emphasis run of exactly Count copies of C,
-// searching forward from From. Requires non-whitespace immediately inside both
-// the opening and closing delimiters and non-empty content, a simplified take
-// on the CommonMark §6.2 flanking rules. Returns StringRef::npos if no valid
-// closing run exists.
-static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+// Finds the start index of a closing emphasis run of exactly DelimLen copies of
+// DelimChar, searching forward from StartPos. Requires non-whitespace
+// immediately inside both the opening and closing delimiters and non-empty
+// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
+// StringRef::npos if no valid closing run exists.
+static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
+                               size_t DelimLen) {
   size_t E = S.size();
   // Opening delimiter is not left-flanking if whitespace follows it.
-  if (From >= E || isSpace(S[From]))
+  if (StartPos >= E || isSpace(S[StartPos]))
     return StringRef::npos;
-  for (size_t J = From; J + Count <= E; ++J) {
-    if (S[J] != C)
+  for (size_t J = StartPos; J + DelimLen <= E; ++J) {
+    if (S[J] != DelimChar)
       continue;
-    size_t Run = countRun(S, J, C);
-    if (Run != Count) {
+    size_t Run = countRun(S, J, DelimChar);
+    if (Run != DelimLen) {
       J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
       continue;
     }
     // Reject empty content and closing runs that are not right-flanking.
-    if (J == From || isSpace(S[J - 1]))
+    if (J == StartPos || isSpace(S[J - 1]))
       continue;
     return J;
   }
@@ -257,6 +258,9 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       SmallVector<StringRef> Rows;
+      // TODO: Rows are kept as raw line text for now. Table cells may contain
+      // inline content (emphasis, code spans, links), so each row may need to
+      // be split on '|' and parsed further into structured cells.
       while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
         Rows.push_back(internString(Reader.advance().trim(), Arena));
       auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index ea72dacfb08e5..28bb9d567e6bc 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -301,4 +301,4 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
   EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
 }
 
-} // namespace
\ No newline at end of file
+} // namespace

>From f33ef2ce3f9292e10f1e1dd220a500070ef21bc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:29:15 -0400
Subject: [PATCH 10/27] [clang-doc] Address review feedback: make
 UnterminatedFence and MixedContent tests explicit

---
 .../unittests/clang-doc/MarkdownParserTest.cpp  | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 28bb9d567e6bc..207ae938c299a 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -58,13 +58,17 @@ some code
   EXPECT_TRUE(N->Lang.empty());
 }
 
-TEST_F(MarkdownParserTest, UnterminatedFenceReturnsEmpty) {
+TEST_F(MarkdownParserTest, UnterminatedFenceProducesCodeNode) {
   auto Nodes = parseMarkdown(R"(```cpp
 int x = 0;)",
                              Arena);
-  // Unterminated fence should not crash and should produce a code node
-  // with whatever lines were found.
-  EXPECT_FALSE(Nodes.empty());
+  // An unterminated fence should not crash. The parser falls back to emitting a
+  // FencedCodeNode with whatever lines were found before the end of input.
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<FencedCodeNode>(Nodes[0]);
+  EXPECT_EQ(N->Lang, "cpp");
+  ASSERT_EQ(N->Lines.size(), 1u);
+  EXPECT_EQ(N->Lines[0], "int x = 0;");
 }
 
 TEST_F(MarkdownParserTest, PipeTable) {
@@ -105,7 +109,10 @@ code
 ````````
 - item)",
                              Arena);
-  EXPECT_EQ(Nodes.size(), 3u);
+  ASSERT_EQ(Nodes.size(), 3u);
+  EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+  EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
+  EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
 }
 
 // CommonMark §4.5 example 120: tilde fences work the same as backtick fences.

>From 4371be42e6ccb7a955301c77b5b732e45675347d Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:35:54 -0400
Subject: [PATCH 11/27] [clang-doc] Replace internString with
 llvm::StringSaver, matching Mustache pattern

---
 .../clang-doc/support/Markdown.cpp            | 34 ++++++++-----------
 1 file changed, 15 insertions(+), 19 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6a57cd7900ea2..be2800bff5df7 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -12,6 +12,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/DebugLog.h"
+#include "llvm/Support/StringSaver.h"
 #include <cassert>
 
 #define DEBUG_TYPE "clang-doc"
@@ -31,15 +32,6 @@ static ArrayRef<T> allocateArray(SmallVectorImpl<T> &Vec,
   return ArrayRef<T>(Allocated, Vec.size());
 }
 
-// Interns a StringRef into the arena so it outlives the parse loop.
-static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
-  if (S.empty())
-    return {};
-  char *Buf = Arena.Allocate<char>(S.size());
-  std::copy(S.begin(), S.end(), Buf);
-  return StringRef(Buf, S.size());
-}
-
 // A line is a table separator if it only contains |, -, :, and spaces,
 // and has at least one -.
 static bool isSepRow(StringRef Line) {
@@ -142,14 +134,15 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
 //
 // TODO: This covers the common cases but not the full CommonMark §6 inline
 // model (delimiter stacks, intraword underscore rules, links, autolinks).
-static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
+                                      StringSaver &Saver) {
   SmallVector<MDNode *> Nodes;
   size_t TextStart = 0, Pos = 0, E = S.size();
 
   auto flushText = [&](size_t End) {
     if (End > TextStart)
       Nodes.push_back(new (Arena) TextNode(
-          internString(S.substr(TextStart, End - TextStart), Arena)));
+          Saver.save(S.substr(TextStart, End - TextStart))));
   };
 
   while (Pos < E) {
@@ -166,7 +159,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
         flushText(Pos);
         StringRef Code =
             trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
-        Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+        Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
         Pos = ClosePos + OpenLen;
         TextStart = Pos;
         continue;
@@ -184,7 +177,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
         if (Close != StringRef::npos) {
           flushText(Pos);
           StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
-          Nodes.push_back(new (Arena) StrongNode(parseInline(Inner, Arena)));
+          Nodes.push_back(new (Arena)
+                              StrongNode(parseInline(Inner, Arena, Saver)));
           Pos = Close + 2;
           TextStart = Pos;
           continue;
@@ -194,7 +188,8 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
       if (Close != StringRef::npos) {
         flushText(Pos);
         StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
-        Nodes.push_back(new (Arena) EmphasisNode(parseInline(Inner, Arena)));
+        Nodes.push_back(new (Arena)
+                            EmphasisNode(parseInline(Inner, Arena, Saver)));
         Pos = Close + 1;
         TextStart = Pos;
         continue;
@@ -213,6 +208,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
   if (ParagraphText.trim().empty())
     return {};
 
+  StringSaver Saver(Arena);
   SmallVector<StringRef, 16> Lines;
   ParagraphText.split(Lines, '\n');
 
@@ -234,7 +230,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     // case-by-case basis.
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
       char Fence = Line[0];
-      StringRef Lang = internString(Line.drop_front(3).trim(), Arena);
+      StringRef Lang = Saver.save(Line.drop_front(3).trim());
       Reader.advance(); // consume opening fence
       SmallVector<StringRef> CodeLines;
       while (!Reader.atEnd()) {
@@ -243,7 +239,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
             all_of(CodeLine.take_front(3),
                    [Fence](char C) { return C == Fence; }))
           break;
-        CodeLines.push_back(internString(Reader.advance(), Arena));
+        CodeLines.push_back(Saver.save(Reader.advance()));
       }
       if (!Reader.atEnd())
         Reader.advance(); // consume closing fence
@@ -262,7 +258,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       // inline content (emphasis, code spans, links), so each row may need to
       // be split on '|' and parsed further into structured cells.
       while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
-        Rows.push_back(internString(Reader.advance().trim(), Arena));
+        Rows.push_back(Saver.save(Reader.advance().trim()));
       auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
       LDBG() << "emitting TableNode rows=" << Rows.size();
       Nodes.push_back(Table);
@@ -276,7 +272,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
         StringRef L = Reader.peek().trim();
         if (!isListItem(L))
           break;
-        StringRef ItemText = internString(L.drop_front(2).trim(), Arena);
+        StringRef ItemText = Saver.save(L.drop_front(2).trim());
         SmallVector<MDNode *> ItemChildren;
         ItemChildren.push_back(new (Arena) TextNode(ItemText));
         auto *Item =
@@ -291,7 +287,7 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
     }
 
     // Plain text, scanned for inline constructs (emphasis, strong, code).
-    for (MDNode *Inline : parseInline(Line, Arena))
+    for (MDNode *Inline : parseInline(Line, Arena, Saver))
       Nodes.push_back(Inline);
     Reader.advance();
   }

>From 0b5f53715fc6e78a56145609893fa61f5cf4f353 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 22:59:09 -0400
Subject: [PATCH 12/27] [clang-doc] Address review feedback: fix comment
 accuracy and trim AI-sounding language

---
 clang-tools-extra/clang-doc/support/Markdown.h      | 12 +++++-------
 .../unittests/clang-doc/MarkdownParserTest.cpp      | 13 +++++--------
 2 files changed, 10 insertions(+), 15 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 60390465588c3..8c2055868671a 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -49,8 +49,7 @@
 namespace clang::doc::markdown {
 
 /// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
-/// block kinds so that the sentinels NK_LastInline and NK_FirstBlock enable
-/// cheap range-based checks in classof() implementations.
+/// block kinds.
 enum class NodeKind {
   // Inline nodes
   NK_Text,
@@ -193,8 +192,7 @@ struct TableNode : MDNode {
   static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
 };
 
-/// A single list item. Children may contain block-level nodes for loose
-/// lists, or a single inline sequence for tight lists.
+/// A single list item. Children holds the item's inline content.
 struct ListItemNode : MDNode {
   llvm::ArrayRef<MDNode *> Children;
   explicit ListItemNode(llvm::ArrayRef<MDNode *> Children)
@@ -248,9 +246,9 @@ struct ThematicBreakNode : MDNode {
 //===----------------------------------------------------------------------===//
 
 /// Parse Markdown from a single paragraph of plain text. Returns a list of
-/// top-level block nodes allocated in Arena. Returns an empty ArrayRef if no
-/// Markdown constructs are found, letting callers fall back to plain-text
-/// rendering at zero cost. The parser never crashes on malformed input.
+/// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty
+/// or whitespace-only input; plain text with no Markdown constructs returns a
+/// single TextNode.
 ///
 /// The caller must keep Arena alive for the lifetime of any returned nodes.
 llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 207ae938c299a..e2fd07159d446 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -196,16 +196,13 @@ foo
   ASSERT_EQ(N->Lines.size(), 1u);
 }
 
-// CommonMark §4.5 example 124: closing fence must be at least as long as the
-// opening fence.
-// TODO: our parser currently closes on the first line with 3 matching fence
-// chars regardless of opening fence length. Fix as part of the CommonMark
-// TODO in parseMarkdown().
+// CommonMark §4.5 example 124: the closing fence must be at least as long as
+// the opening fence. Our parser closes on the first line with 3 matching fence
+// chars regardless of opening length, so this documents the current
+// non-conformant behavior.
+// TODO: fix as part of the CommonMark TODO in parseMarkdown().
 TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
   auto Nodes = parseMarkdown("````\naaa\n```", Arena);
-  // The ``` line should not close the ```` fence per CommonMark, but our
-  // parser currently treats it as a closing fence. This test documents the
-  // current (non-conformant) behavior.
   ASSERT_EQ(Nodes.size(), 1u);
   auto *N = cast<FencedCodeNode>(Nodes[0]);
   ASSERT_EQ(N->Lines.size(), 1u);

>From 14f455ecde0305ec38e20ca6068b0d8f5f259776 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:07:02 -0400
Subject: [PATCH 13/27] [clang-doc] Wrap plain-text paragraph lines in
 ParagraphNode

---
 .../clang-doc/support/Markdown.cpp            |  7 +-
 .../clang-doc/MarkdownParserTest.cpp          | 67 +++++++++++++------
 2 files changed, 50 insertions(+), 24 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index be2800bff5df7..59e651d2b8b05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -286,9 +286,10 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
-    // Plain text, scanned for inline constructs (emphasis, strong, code).
-    for (MDNode *Inline : parseInline(Line, Arena, Saver))
-      Nodes.push_back(Inline);
+    // Plain text line: scan for inline constructs (emphasis, strong, code) and
+    // wrap the result in a paragraph.
+    auto Inlines = parseInline(Line, Arena, Saver);
+    Nodes.push_back(new (Arena) ParagraphNode(Inlines));
     Reader.advance();
   }
 
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index e2fd07159d446..63d978061b99b 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -33,8 +33,9 @@ TEST_F(MarkdownParserTest, WhitespaceOnlyInput) {
 TEST_F(MarkdownParserTest, PlainText) {
   auto Nodes = parseMarkdown("hello world", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *N = cast<TextNode>(Nodes[0]);
-  EXPECT_EQ(N->Text, "hello world");
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello world");
 }
 
 TEST_F(MarkdownParserTest, FencedCodeBlock) {
@@ -110,7 +111,7 @@ code
 - item)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 3u);
-  EXPECT_TRUE(isa<TextNode>(Nodes[0]));
+  EXPECT_TRUE(isa<ParagraphNode>(Nodes[0]));
   EXPECT_TRUE(isa<FencedCodeNode>(Nodes[1]));
   EXPECT_TRUE(isa<UnorderedListNode>(Nodes[2]));
 }
@@ -210,18 +211,22 @@ TEST_F(MarkdownParserTest, ClosingFenceLengthTODO) {
 
 TEST_F(MarkdownParserTest, EmphasisAsterisk) {
   auto Nodes = parseMarkdown("an *important* word", Arena);
-  ASSERT_EQ(Nodes.size(), 3u);
-  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "an ");
-  auto *Em = cast<EmphasisNode>(Nodes[1]);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(P->Children[1]);
   ASSERT_EQ(Em->Children.size(), 1u);
   EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
-  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " word");
+  EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " word");
 }
 
 TEST_F(MarkdownParserTest, EmphasisUnderscore) {
   auto Nodes = parseMarkdown("_em_", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *Em = cast<EmphasisNode>(P->Children[0]);
   ASSERT_EQ(Em->Children.size(), 1u);
   EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "em");
 }
@@ -229,7 +234,9 @@ TEST_F(MarkdownParserTest, EmphasisUnderscore) {
 TEST_F(MarkdownParserTest, StrongAsterisk) {
   auto Nodes = parseMarkdown("**bold**", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *St = cast<StrongNode>(Nodes[0]);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *St = cast<StrongNode>(P->Children[0]);
   ASSERT_EQ(St->Children.size(), 1u);
   EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
 }
@@ -237,7 +244,9 @@ TEST_F(MarkdownParserTest, StrongAsterisk) {
 TEST_F(MarkdownParserTest, StrongUnderscore) {
   auto Nodes = parseMarkdown("__bold__", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *St = cast<StrongNode>(Nodes[0]);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *St = cast<StrongNode>(P->Children[0]);
   ASSERT_EQ(St->Children.size(), 1u);
   EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bold");
 }
@@ -246,15 +255,19 @@ TEST_F(MarkdownParserTest, StrongUnderscore) {
 TEST_F(MarkdownParserTest, StrongBindsBeforeEmphasis) {
   auto Nodes = parseMarkdown("**strong**", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_TRUE(isa<StrongNode>(Nodes[0]));
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_TRUE(isa<StrongNode>(P->Children[0]));
 }
 
 TEST_F(MarkdownParserTest, InlineCode) {
   auto Nodes = parseMarkdown("call `foo()` here", Arena);
-  ASSERT_EQ(Nodes.size(), 3u);
-  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "call ");
-  EXPECT_EQ(cast<InlineCodeNode>(Nodes[1])->Code, "foo()");
-  EXPECT_EQ(cast<TextNode>(Nodes[2])->Text, " here");
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "call ");
+  EXPECT_EQ(cast<InlineCodeNode>(P->Children[1])->Code, "foo()");
+  EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " here");
 }
 
 // CommonMark §6.1: a doubled backtick fence lets the span contain a single
@@ -262,14 +275,18 @@ TEST_F(MarkdownParserTest, InlineCode) {
 TEST_F(MarkdownParserTest, InlineCodeDoubleBacktick) {
   auto Nodes = parseMarkdown("``a`b``", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(cast<InlineCodeNode>(Nodes[0])->Code, "a`b");
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "a`b");
 }
 
 // Emphasis and strong recurse, so a code span inside emphasis is parsed.
 TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
   auto Nodes = parseMarkdown("*see `x`*", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *Em = cast<EmphasisNode>(Nodes[0]);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *Em = cast<EmphasisNode>(P->Children[0]);
   ASSERT_EQ(Em->Children.size(), 2u);
   EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "see ");
   EXPECT_EQ(cast<InlineCodeNode>(Em->Children[1])->Code, "x");
@@ -278,7 +295,9 @@ TEST_F(MarkdownParserTest, CodeSpanInsideEmphasis) {
 TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
   auto Nodes = parseMarkdown("**a `b`**", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  auto *St = cast<StrongNode>(Nodes[0]);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *St = cast<StrongNode>(P->Children[0]);
   ASSERT_EQ(St->Children.size(), 2u);
   EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "a ");
   EXPECT_EQ(cast<InlineCodeNode>(St->Children[1])->Code, "b");
@@ -288,21 +307,27 @@ TEST_F(MarkdownParserTest, CodeSpanInsideStrong) {
 TEST_F(MarkdownParserTest, UnmatchedDelimiterIsText) {
   auto Nodes = parseMarkdown("a * b", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a * b");
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a * b");
 }
 
 // An unterminated code span leaves the backtick as literal text.
 TEST_F(MarkdownParserTest, UnterminatedCodeSpanIsText) {
   auto Nodes = parseMarkdown("a `b c", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "a `b c");
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "a `b c");
 }
 
 // Inline parsing must not disturb plain text with no markers.
 TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
   auto Nodes = parseMarkdown("just words", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_EQ(cast<TextNode>(Nodes[0])->Text, "just words");
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
 }
 
 } // namespace

>From 7bb303ad25d10ba9540af4ee38f3aac0582d49df Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:18:11 -0400
Subject: [PATCH 14/27] [clang-doc] Add CharReader cursor for character-level
 inline scanning

---
 .../clang-doc/support/Markdown.cpp            | 75 +++++++++++++++----
 1 file changed, 60 insertions(+), 15 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 59e651d2b8b05..1eb6ad51eaf02 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -82,6 +82,49 @@ class LineReader {
   size_t Pos = 0;
 };
 
+// A forward cursor over the characters of a string. The character-level analog
+// of LineReader: the inline scanner inspects the current or an upcoming
+// character and consumes characters without manual index arithmetic. position()
+// and seek() let it interoperate with the index-based run and delimiter helpers
+// below, since inline constructs are not consumed one character at a time.
+class CharReader {
+public:
+  explicit CharReader(StringRef S) : S(S) {}
+
+  // True once every character has been consumed.
+  bool atEnd() const { return Pos >= S.size(); }
+
+  // The current character. Must not be called when atEnd().
+  char peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return S[Pos];
+  }
+
+  // The character Offset positions ahead of the cursor, or '\0' when that
+  // position is past the end. peek(0) is the current character.
+  char peek(size_t Offset) const {
+    size_t Target = Pos + Offset;
+    return Target < S.size() ? S[Target] : '\0';
+  }
+
+  // Consume the current character and return it. Must not be called when
+  // atEnd().
+  char advance() {
+    assert(!atEnd() && "advance past end of input");
+    return S[Pos++];
+  }
+
+  // The current scan position, for substring, run, and delimiter computations.
+  size_t position() const { return Pos; }
+
+  // Move the cursor to an absolute position, used to skip past a matched span.
+  void seek(size_t NewPos) { Pos = NewPos; }
+
+private:
+  StringRef S;
+  size_t Pos = 0;
+};
+
 // Returns the number of consecutive copies of C starting at S[Start].
 static size_t countRun(StringRef S, size_t Start, char C) {
   size_t I = Start;
@@ -137,7 +180,8 @@ static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
 static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
                                       StringSaver &Saver) {
   SmallVector<MDNode *> Nodes;
-  size_t TextStart = 0, Pos = 0, E = S.size();
+  CharReader Reader(S);
+  size_t TextStart = 0;
 
   auto flushText = [&](size_t End) {
     if (End > TextStart)
@@ -145,42 +189,43 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
           Saver.save(S.substr(TextStart, End - TextStart))));
   };
 
-  while (Pos < E) {
-    char C = S[Pos];
+  while (!Reader.atEnd()) {
+    size_t Pos = Reader.position();
+    char C = Reader.peek();
 
     // Inline code span: an opening backtick run closed by a run of the same
     // length.
     if (C == '`') {
       size_t OpenLen = countRun(S, Pos, '`');
       size_t ClosePos = Pos + OpenLen;
-      while (ClosePos < E && countRun(S, ClosePos, '`') != OpenLen)
+      while (ClosePos < S.size() && countRun(S, ClosePos, '`') != OpenLen)
         ClosePos += S[ClosePos] == '`' ? countRun(S, ClosePos, '`') : 1;
-      if (ClosePos < E) {
+      if (ClosePos < S.size()) {
         flushText(Pos);
         StringRef Code =
             trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
         Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
-        Pos = ClosePos + OpenLen;
-        TextStart = Pos;
+        Reader.seek(ClosePos + OpenLen);
+        TextStart = Reader.position();
         continue;
       }
       // No closing run; leave the backticks as literal text.
-      Pos += OpenLen;
+      Reader.seek(Pos + OpenLen);
       continue;
     }
 
     // Emphasis (*text*, _text_) and strong (**text**, __text__).
     if (C == '*' || C == '_') {
       // Strong binds the two-delimiter form before single-delimiter emphasis.
-      if (Pos + 1 < E && S[Pos + 1] == C) {
+      if (Reader.peek(1) == C) {
         size_t Close = findClosingDelim(S, Pos + 2, C, 2);
         if (Close != StringRef::npos) {
           flushText(Pos);
           StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
           Nodes.push_back(new (Arena)
                               StrongNode(parseInline(Inner, Arena, Saver)));
-          Pos = Close + 2;
-          TextStart = Pos;
+          Reader.seek(Close + 2);
+          TextStart = Reader.position();
           continue;
         }
       }
@@ -190,16 +235,16 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
         StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
         Nodes.push_back(new (Arena)
                             EmphasisNode(parseInline(Inner, Arena, Saver)));
-        Pos = Close + 1;
-        TextStart = Pos;
+        Reader.seek(Close + 1);
+        TextStart = Reader.position();
         continue;
       }
     }
 
-    ++Pos;
+    Reader.advance();
   }
 
-  flushText(E);
+  flushText(S.size());
   return allocateArray(Nodes, Arena);
 }
 

>From 6864c7b552b37e64ee69c4660517da2cf2c22975 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:22:06 -0400
Subject: [PATCH 15/27] [clang-doc] Extract block parse bodies into separate
 functions

---
 .../clang-doc/support/Markdown.cpp            | 122 +++++++++++-------
 1 file changed, 73 insertions(+), 49 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 1eb6ad51eaf02..625b3e6305ab9 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -248,6 +248,75 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
   return allocateArray(Nodes, Arena);
 }
 
+// Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
+// opening fence; the fence, body lines, and closing fence are consumed.
+//
+// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// indented up to 3 spaces, the closing fence must use the same character and be
+// at least as long as the opening fence, and the closing fence may only be
+// followed by spaces. Doxygen specifics should be handled on a case-by-case
+// basis.
+static FencedCodeNode *parseFencedCode(LineReader &Reader,
+                                       BumpPtrAllocator &Arena,
+                                       StringSaver &Saver) {
+  StringRef Open = Reader.peek().trim();
+  char Fence = Open[0];
+  StringRef Lang = Saver.save(Open.drop_front(3).trim());
+  Reader.advance(); // consume opening fence
+  SmallVector<StringRef> CodeLines;
+  while (!Reader.atEnd()) {
+    StringRef CodeLine = Reader.peek().trim();
+    if (CodeLine.size() >= 3 &&
+        all_of(CodeLine.take_front(3), [Fence](char C) { return C == Fence; }))
+      break;
+    CodeLines.push_back(Saver.save(Reader.advance()));
+  }
+  if (!Reader.atEnd())
+    Reader.advance(); // consume closing fence
+  auto *Code =
+      new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
+  LDBG() << "emitting FencedCodeNode lang='" << Lang
+         << "' lines=" << CodeLines.size();
+  return Code;
+}
+
+// Parses a pipe table. The cursor must be on the header row, with a separator
+// row following; consecutive lines containing a | are taken as rows.
+static TableNode *parsePipeTable(LineReader &Reader, BumpPtrAllocator &Arena,
+                                 StringSaver &Saver) {
+  SmallVector<StringRef> Rows;
+  // TODO: Rows are kept as raw line text for now. Table cells may contain
+  // inline content (emphasis, code spans, links), so each row may need to be
+  // split on '|' and parsed further into structured cells.
+  while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
+    Rows.push_back(Saver.save(Reader.advance().trim()));
+  auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
+  LDBG() << "emitting TableNode rows=" << Rows.size();
+  return Table;
+}
+
+// Parses an unordered (bullet) list. The cursor must be on the first item;
+// consecutive bullet lines are consumed into list items.
+static UnorderedListNode *parseUnorderedList(LineReader &Reader,
+                                             BumpPtrAllocator &Arena,
+                                             StringSaver &Saver) {
+  SmallVector<ListItemNode *> Items;
+  while (!Reader.atEnd()) {
+    StringRef L = Reader.peek().trim();
+    if (!isListItem(L))
+      break;
+    StringRef ItemText = Saver.save(L.drop_front(2).trim());
+    SmallVector<MDNode *> ItemChildren;
+    ItemChildren.push_back(new (Arena) TextNode(ItemText));
+    auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+    Items.push_back(Item);
+    Reader.advance();
+  }
+  auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
+  LDBG() << "emitting UnorderedListNode items=" << Items.size();
+  return List;
+}
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -268,66 +337,21 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
-    // TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
-    // indented up to 3 spaces, the closing fence must use the same character
-    // and be at least as long as the opening fence, and the closing fence may
-    // only be followed by spaces. Doxygen specifics should be handled on a
-    // case-by-case basis.
+    // Fenced code block.
     if (Line.starts_with("```") || Line.starts_with("~~~")) {
-      char Fence = Line[0];
-      StringRef Lang = Saver.save(Line.drop_front(3).trim());
-      Reader.advance(); // consume opening fence
-      SmallVector<StringRef> CodeLines;
-      while (!Reader.atEnd()) {
-        StringRef CodeLine = Reader.peek().trim();
-        if (CodeLine.size() >= 3 &&
-            all_of(CodeLine.take_front(3),
-                   [Fence](char C) { return C == Fence; }))
-          break;
-        CodeLines.push_back(Saver.save(Reader.advance()));
-      }
-      if (!Reader.atEnd())
-        Reader.advance(); // consume closing fence
-      auto *Code =
-          new (Arena) FencedCodeNode(Lang, allocateArray(CodeLines, Arena));
-      LDBG() << "emitting FencedCodeNode lang='" << Lang
-             << "' lines=" << CodeLines.size();
-      Nodes.push_back(Code);
+      Nodes.push_back(parseFencedCode(Reader, Arena, Saver));
       continue;
     }
 
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
-      SmallVector<StringRef> Rows;
-      // TODO: Rows are kept as raw line text for now. Table cells may contain
-      // inline content (emphasis, code spans, links), so each row may need to
-      // be split on '|' and parsed further into structured cells.
-      while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
-        Rows.push_back(Saver.save(Reader.advance().trim()));
-      auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
-      LDBG() << "emitting TableNode rows=" << Rows.size();
-      Nodes.push_back(Table);
+      Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
       continue;
     }
 
     // Unordered list item.
     if (isListItem(Line)) {
-      SmallVector<ListItemNode *> Items;
-      while (!Reader.atEnd()) {
-        StringRef L = Reader.peek().trim();
-        if (!isListItem(L))
-          break;
-        StringRef ItemText = Saver.save(L.drop_front(2).trim());
-        SmallVector<MDNode *> ItemChildren;
-        ItemChildren.push_back(new (Arena) TextNode(ItemText));
-        auto *Item =
-            new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
-        Items.push_back(Item);
-        Reader.advance();
-      }
-      auto *List = new (Arena) UnorderedListNode(allocateArray(Items, Arena));
-      LDBG() << "emitting UnorderedListNode items=" << Items.size();
-      Nodes.push_back(List);
+      Nodes.push_back(parseUnorderedList(Reader, Arena, Saver));
       continue;
     }
 

>From 86e45d603fefc440f3334516b5f4fcfd69354d7a Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:27:51 -0400
Subject: [PATCH 16/27] [clang-doc] Add ATX heading parsing with inline content
 support

---
 .../clang-doc/support/Markdown.cpp            | 34 +++++++++++
 .../clang-doc/MarkdownParserTest.cpp          | 59 +++++++++++++++++++
 2 files changed, 93 insertions(+)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 625b3e6305ab9..d59d95586e836 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,16 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
+// Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
+// six leading # characters followed by a space. Returns 0 otherwise, so seven
+// or more # characters fall back to plain text.
+static unsigned atxHeadingLevel(StringRef Line) {
+  size_t Level = Line.find_first_not_of('#');
+  if (Level == StringRef::npos || Level < 1 || Level > 6 || Line[Level] != ' ')
+    return 0;
+  return Level;
+}
+
 // A forward cursor over the lines of a paragraph. Encapsulates the parse
 // position so the loop can inspect the current or an upcoming line and consume
 // lines without manual index arithmetic. Lines are stored untrimmed; callers
@@ -317,6 +327,24 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
   return List;
 }
 
+// Parses an ATX heading: one to six leading # characters and a space, followed
+// by inline content. The cursor must be on the heading line, which is consumed.
+//
+// TODO: CommonMark §4.2 also allows up to 3 leading spaces and an optional
+// closing run of # characters; neither is handled yet.
+static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
+                                 StringSaver &Saver) {
+  StringRef Line = Reader.peek().trim();
+  unsigned Level = atxHeadingLevel(Line);
+  assert(Level >= 1 && Level <= 6 && "parseHeading called on a non-heading");
+  StringRef Content = Line.drop_front(Level).trim();
+  Reader.advance();
+  auto *Heading =
+      new (Arena) HeadingNode(Level, parseInline(Content, Arena, Saver));
+  LDBG() << "emitting HeadingNode level=" << Level;
+  return Heading;
+}
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -343,6 +371,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
+    // ATX heading: 1 to 6 leading # characters and a space.
+    if (atxHeadingLevel(Line)) {
+      Nodes.push_back(parseHeading(Reader, Arena, Saver));
+      continue;
+    }
+
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 63d978061b99b..c48b7a463c3a0 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -330,4 +330,63 @@ TEST_F(MarkdownParserTest, PlainTextHasNoInlineNodes) {
   EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "just words");
 }
 
+TEST_F(MarkdownParserTest, Heading1) {
+  auto Nodes = parseMarkdown("# Title", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 1u);
+  ASSERT_EQ(H->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading2) {
+  auto Nodes = parseMarkdown("## Title", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 2u);
+  ASSERT_EQ(H->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, Heading3) {
+  auto Nodes = parseMarkdown("### Title", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 3u);
+  ASSERT_EQ(H->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Title");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithInlineCode) {
+  auto Nodes = parseMarkdown("# Use `foo()`", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 1u);
+  ASSERT_EQ(H->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "Use ");
+  EXPECT_EQ(cast<InlineCodeNode>(H->Children[1])->Code, "foo()");
+}
+
+TEST_F(MarkdownParserTest, HeadingWithEmphasis) {
+  auto Nodes = parseMarkdown("## see *this*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 2u);
+  ASSERT_EQ(H->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "see ");
+  auto *Em = cast<EmphasisNode>(H->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "this");
+}
+
+// Seven or more # characters are not a valid ATX heading, so the line falls
+// back to a plain-text paragraph.
+TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
+  auto Nodes = parseMarkdown("####### too many", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
+}
+
 } // namespace

>From 2b14505cadb016131f53cbc3200973c3cee6ae04 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:32:14 -0400
Subject: [PATCH 17/27] [clang-doc] Run list item text through parseInline for
 inline markup support

---
 .../clang-doc/support/Markdown.cpp            |  6 ++---
 .../clang-doc/MarkdownParserTest.cpp          | 24 ++++++++++++++++---
 2 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index d59d95586e836..6901f6c2f40a5 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -315,10 +315,8 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
     StringRef L = Reader.peek().trim();
     if (!isListItem(L))
       break;
-    StringRef ItemText = Saver.save(L.drop_front(2).trim());
-    SmallVector<MDNode *> ItemChildren;
-    ItemChildren.push_back(new (Arena) TextNode(ItemText));
-    auto *Item = new (Arena) ListItemNode(allocateArray(ItemChildren, Arena));
+    StringRef ItemText = L.drop_front(2).trim();
+    auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
     Items.push_back(Item);
     Reader.advance();
   }
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index c48b7a463c3a0..9a7d6d1fd0942 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -98,9 +98,27 @@ TEST_F(MarkdownParserTest, UnorderedList) {
   ASSERT_EQ(Nodes.size(), 1u);
   auto *N = cast<UnorderedListNode>(Nodes[0]);
   ASSERT_EQ(N->Items.size(), 3u);
-  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
-  EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "bar");
-  EXPECT_EQ(cast<TextNode>(N->Items[2]->Children[0])->Text, "baz");
+  // Each item's children are the inline nodes from parseInline.
+  StringRef ExpectedText[] = {"foo", "bar", "baz"};
+  for (size_t I = 0; I < N->Items.size(); ++I) {
+    auto *Item = N->Items[I];
+    ASSERT_EQ(Item->Children.size(), 1u);
+    EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+  }
+}
+
+TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
+  auto Nodes = parseMarkdown("- an *important* note", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<UnorderedListNode>(Nodes[0]);
+  ASSERT_EQ(N->Items.size(), 1u);
+  auto *Item = N->Items[0];
+  ASSERT_EQ(Item->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(Item->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+  EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
 }
 
 TEST_F(MarkdownParserTest, MixedContent) {

>From aaf4b6e2b1600bce25f625abb2caf9ad25b52f90 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:42:27 -0400
Subject: [PATCH 18/27] [clang-doc] Add ordered list parsing with inline
 content support

---
 .../clang-doc/support/Markdown.cpp            | 40 +++++++++++++++++
 .../clang-doc/MarkdownParserTest.cpp          | 44 +++++++++++++++++++
 2 files changed, 84 insertions(+)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 6901f6c2f40a5..211fb0407578f 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -46,6 +46,14 @@ static bool isListItem(StringRef Line) {
          Line.starts_with("+ ");
 }
 
+// Returns true if Line begins with an ordered list marker: one or more digits
+// followed by a period and a space (e.g. "1. ", "42. ").
+static bool isOrderedListItem(StringRef Line) {
+  size_t Dot = Line.find_first_not_of("0123456789");
+  return Dot != StringRef::npos && Dot > 0 && Line[Dot] == '.' &&
+         Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
+}
+
 // Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
 // six leading # characters followed by a space. Returns 0 otherwise, so seven
 // or more # characters fall back to plain text.
@@ -325,6 +333,32 @@ static UnorderedListNode *parseUnorderedList(LineReader &Reader,
   return List;
 }
 
+// Parses an ordered (numbered) list. The cursor must be on the first item; the
+// start number is taken from that item's marker and consecutive numbered lines
+// are consumed. Item numbers after the first are not validated.
+static OrderedListNode *parseOrderedList(LineReader &Reader,
+                                         BumpPtrAllocator &Arena,
+                                         StringSaver &Saver) {
+  unsigned Start = 0;
+  Reader.peek().trim().take_while(isDigit).getAsInteger(10, Start);
+  SmallVector<ListItemNode *> Items;
+  while (!Reader.atEnd()) {
+    StringRef L = Reader.peek().trim();
+    if (!isOrderedListItem(L))
+      break;
+    // Drop the "<digits>. " marker: the digits, the period, and the space.
+    StringRef ItemText =
+        L.drop_front(L.find_first_not_of("0123456789") + 2).trim();
+    auto *Item = new (Arena) ListItemNode(parseInline(ItemText, Arena, Saver));
+    Items.push_back(Item);
+    Reader.advance();
+  }
+  auto *List = new (Arena) OrderedListNode(Start, allocateArray(Items, Arena));
+  LDBG() << "emitting OrderedListNode start=" << Start
+         << " items=" << Items.size();
+  return List;
+}
+
 // Parses an ATX heading: one to six leading # characters and a space, followed
 // by inline content. The cursor must be on the heading line, which is consumed.
 //
@@ -387,6 +421,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
+    // Ordered list item: digits followed by a period and a space.
+    if (isOrderedListItem(Line)) {
+      Nodes.push_back(parseOrderedList(Reader, Arena, Saver));
+      continue;
+    }
+
     // Plain text line: scan for inline constructs (emphasis, strong, code) and
     // wrap the result in a paragraph.
     auto Inlines = parseInline(Line, Arena, Saver);
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 9a7d6d1fd0942..a0ba39c163a34 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -121,6 +121,50 @@ TEST_F(MarkdownParserTest, ListItemWithEmphasis) {
   EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
 }
 
+TEST_F(MarkdownParserTest, OrderedList) {
+  auto Nodes = parseMarkdown(R"(1. foo
+2. bar
+3. baz)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<OrderedListNode>(Nodes[0]);
+  EXPECT_EQ(N->Start, 1u);
+  ASSERT_EQ(N->Items.size(), 3u);
+  StringRef ExpectedText[] = {"foo", "bar", "baz"};
+  for (size_t I = 0; I < N->Items.size(); ++I) {
+    auto *Item = N->Items[I];
+    ASSERT_EQ(Item->Children.size(), 1u);
+    EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, ExpectedText[I]);
+  }
+}
+
+TEST_F(MarkdownParserTest, OrderedListCustomStart) {
+  auto Nodes = parseMarkdown(R"(5. five
+6. six)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<OrderedListNode>(Nodes[0]);
+  EXPECT_EQ(N->Start, 5u);
+  ASSERT_EQ(N->Items.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "five");
+  EXPECT_EQ(cast<TextNode>(N->Items[1]->Children[0])->Text, "six");
+}
+
+TEST_F(MarkdownParserTest, OrderedListItemWithEmphasis) {
+  auto Nodes = parseMarkdown("1. an *important* note", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<OrderedListNode>(Nodes[0]);
+  EXPECT_EQ(N->Start, 1u);
+  ASSERT_EQ(N->Items.size(), 1u);
+  auto *Item = N->Items[0];
+  ASSERT_EQ(Item->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Item->Children[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(Item->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+  EXPECT_EQ(cast<TextNode>(Item->Children[2])->Text, " note");
+}
+
 TEST_F(MarkdownParserTest, MixedContent) {
   auto Nodes = parseMarkdown(R"(some text
 ```````

>From 2ce9a89495e81eb5f0c67551f114e08eadefdabd Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:47:42 -0400
Subject: [PATCH 19/27] [clang-doc] Add thematic break parsing

---
 .../clang-doc/support/Markdown.cpp            | 26 +++++++++++++++++++
 .../clang-doc/MarkdownParserTest.cpp          | 18 +++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 211fb0407578f..2f0cc5bffe566 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -54,6 +54,23 @@ static bool isOrderedListItem(StringRef Line) {
          Dot + 1 < Line.size() && Line[Dot + 1] == ' ';
 }
 
+// Returns true if Line is a thematic break: three or more matching -, *, or _
+// characters, optionally separated by spaces, with nothing else. Line is
+// expected to be trimmed.
+static bool isThematicBreak(StringRef Line) {
+  char Marker = Line.empty() ? '\0' : Line[0];
+  if (Marker != '-' && Marker != '*' && Marker != '_')
+    return false;
+  unsigned Count = 0;
+  for (char C : Line) {
+    if (C == Marker)
+      ++Count;
+    else if (C != ' ')
+      return false;
+  }
+  return Count >= 3;
+}
+
 // Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
 // six leading # characters followed by a space. Returns 0 otherwise, so seven
 // or more # characters fall back to plain text.
@@ -409,6 +426,15 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
+    // Thematic break: 3 or more matching -, *, or _ characters. Checked before
+    // the list cases so that "* * *" and "- - -" are breaks, not list items.
+    if (isThematicBreak(Line)) {
+      Reader.advance();
+      Nodes.push_back(new (Arena) ThematicBreakNode());
+      LDBG() << "emitting ThematicBreakNode";
+      continue;
+    }
+
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index a0ba39c163a34..188d1987ac06d 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -451,4 +451,22 @@ TEST_F(MarkdownParserTest, SevenHashesIsPlainText) {
   EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "####### too many");
 }
 
+TEST_F(MarkdownParserTest, ThematicBreakDashes) {
+  auto Nodes = parseMarkdown("---", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakAsterisks) {
+  auto Nodes = parseMarkdown("***", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
+  auto Nodes = parseMarkdown("___", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
 } // namespace

>From 843d0554dd1ad93c139d69f09fdc06800df7b078 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Fri, 12 Jun 2026 23:58:15 -0400
Subject: [PATCH 20/27] [clang-doc] Add CommonMark spec edge case tests with
 section citations

---
 .../clang-doc/MarkdownParserTest.cpp          | 145 ++++++++++++++++++
 1 file changed, 145 insertions(+)

diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 188d1987ac06d..350b15c2541ed 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -469,4 +469,149 @@ TEST_F(MarkdownParserTest, ThematicBreakUnderscores) {
   EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
 }
 
+//===----------------------------------------------------------------------===//
+// CommonMark spec edge cases (spec.commonmark.org/0.31.2). Each test cites the
+// section and example it exercises. Cases marked DIVERGENCE document where this
+// simplified parser intentionally differs from full CommonMark.
+//===----------------------------------------------------------------------===//
+
+// CommonMark §4.1 Example 51: spaces are allowed between the characters.
+TEST_F(MarkdownParserTest, ThematicBreakSpacedDashes) {
+  auto Nodes = parseMarkdown("- - -", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  EXPECT_TRUE(isa<ThematicBreakNode>(Nodes[0]));
+}
+
+// CommonMark §4.1 Example 44: +++ is not a thematic break.
+TEST_F(MarkdownParserTest, PlusesAreNotThematicBreak) {
+  auto Nodes = parseMarkdown("+++", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "+++");
+}
+
+// CommonMark §4.1 Example 46: fewer than three characters is not a break.
+TEST_F(MarkdownParserTest, TwoDashesAreNotThematicBreak) {
+  auto Nodes = parseMarkdown("--", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "--");
+}
+
+// CommonMark §4.2 Example 64: a # not followed by a space is not a heading.
+TEST_F(MarkdownParserTest, HashWithoutSpaceIsNotHeading) {
+  auto Nodes = parseMarkdown("#5 bolt", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#5 bolt");
+}
+
+// CommonMark §4.2 Example 64: "#hashtag" is a paragraph, not a heading.
+TEST_F(MarkdownParserTest, HashtagIsNotHeading) {
+  auto Nodes = parseMarkdown("#hashtag", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "#hashtag");
+}
+
+// CommonMark §4.2 Example 67: spaces around the heading content are stripped.
+TEST_F(MarkdownParserTest, HeadingStripsContentSpaces) {
+  auto Nodes = parseMarkdown("#         foo", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *H = cast<HeadingNode>(Nodes[0]);
+  EXPECT_EQ(H->Level, 1u);
+  ASSERT_EQ(H->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(H->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2: * is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListAsteriskMarker) {
+  auto Nodes = parseMarkdown("* foo", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<UnorderedListNode>(Nodes[0]);
+  ASSERT_EQ(N->Items.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 301: + is a valid bullet list marker.
+TEST_F(MarkdownParserTest, UnorderedListPlusMarker) {
+  auto Nodes = parseMarkdown("+ foo", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<UnorderedListNode>(Nodes[0]);
+  ASSERT_EQ(N->Items.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "foo");
+}
+
+// CommonMark §5.2 Example 267: an ordered list may start at 0.
+TEST_F(MarkdownParserTest, OrderedListStartZero) {
+  auto Nodes = parseMarkdown("0. ok", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *N = cast<OrderedListNode>(Nodes[0]);
+  EXPECT_EQ(N->Start, 0u);
+  ASSERT_EQ(N->Items.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(N->Items[0]->Children[0])->Text, "ok");
+}
+
+// CommonMark §5.2 Example 296: ordered lists may use a ) delimiter. DIVERGENCE:
+// this parser only recognizes the . delimiter, so "1) foo" is plain text.
+TEST_F(MarkdownParserTest, OrderedListParenDelimiterNotSupported) {
+  auto Nodes = parseMarkdown("1) foo", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "1) foo");
+}
+
+// CommonMark §6.2 Example 355: intraword emphasis with asterisks.
+TEST_F(MarkdownParserTest, IntrawordEmphasisAsterisk) {
+  auto Nodes = parseMarkdown("foo*bar*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+  auto *Em = cast<EmphasisNode>(P->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 381: intraword strong with asterisks.
+TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
+  auto Nodes = parseMarkdown("foo**bar**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+  auto *St = cast<StrongNode>(P->Children[1]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
+// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
+// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+  auto Nodes = parseMarkdown("foo_bar_", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
+  auto *Em = cast<EmphasisNode>(P->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
+// CommonMark §6.1 Example 331: a code span strips one leading and trailing
+// space when both are present.
+TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
+  auto Nodes = parseMarkdown("`` x ``", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
+}
+
 } // namespace

>From e9e6b8d7b1509d36d5c93f604f31b3e4ad9a63ea Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 00:03:54 -0400
Subject: [PATCH 21/27] [clang-doc] Add block quote parsing with recursive
 inner parsing

---
 .../clang-doc/support/Markdown.cpp            | 38 ++++++++++++++
 .../clang-doc/MarkdownParserTest.cpp          | 50 +++++++++++++++++++
 2 files changed, 88 insertions(+)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 2f0cc5bffe566..fdfc619e0ea05 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/StringSaver.h"
 #include <cassert>
+#include <string>
 
 #define DEBUG_TYPE "clang-doc"
 
@@ -71,6 +72,12 @@ static bool isThematicBreak(StringRef Line) {
   return Count >= 3;
 }
 
+// Returns true if Line is a block quote line: it starts with "> ", or is a bare
+// ">" marking an empty quote line.
+static bool isBlockQuote(StringRef Line) {
+  return Line.starts_with("> ") || Line == ">";
+}
+
 // Returns the ATX heading level (1 to 6) when Line is an ATX heading: one to
 // six leading # characters followed by a space. Returns 0 otherwise, so seven
 // or more # characters fall back to plain text.
@@ -394,6 +401,31 @@ static HeadingNode *parseHeading(LineReader &Reader, BumpPtrAllocator &Arena,
   return Heading;
 }
 
+// Parses a block quote: one or more consecutive lines beginning with "> ". The
+// > marker and one following space are stripped from each line, and the
+// collected text is parsed recursively, so a quote's children are block-level
+// nodes and nested quotes fall out naturally.
+static BlockQuoteNode *parseBlockQuote(LineReader &Reader,
+                                       BumpPtrAllocator &Arena) {
+  std::string Inner;
+  bool First = true;
+  while (!Reader.atEnd()) {
+    StringRef L = Reader.peek().trim();
+    if (!isBlockQuote(L))
+      break;
+    if (!First)
+      Inner += '\n';
+    First = false;
+    StringRef Content = L.starts_with("> ") ? L.drop_front(2) : L.drop_front(1);
+    Inner.append(Content.data(), Content.size());
+    Reader.advance();
+  }
+  ArrayRef<MDNode *> Children = parseMarkdown(Inner, Arena);
+  auto *Quote = new (Arena) BlockQuoteNode(Children);
+  LDBG() << "emitting BlockQuoteNode children=" << Children.size();
+  return Quote;
+}
+
 ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
                                  BumpPtrAllocator &Arena) {
   if (ParagraphText.trim().empty())
@@ -435,6 +467,12 @@ ArrayRef<MDNode *> parseMarkdown(StringRef ParagraphText,
       continue;
     }
 
+    // Block quote: consecutive lines beginning with "> ".
+    if (isBlockQuote(Line)) {
+      Nodes.push_back(parseBlockQuote(Reader, Arena));
+      continue;
+    }
+
     // Pipe table: current line has | and next line is a separator row.
     if (Line.contains('|') && isSepRow(Reader.peek(1).trim())) {
       Nodes.push_back(parsePipeTable(Reader, Arena, Saver));
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 350b15c2541ed..aedcd9407b197 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -614,4 +614,54 @@ TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
   EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
 }
 
+TEST_F(MarkdownParserTest, BlockQuote) {
+  auto Nodes = parseMarkdown("> hello", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+  ASSERT_EQ(Q->Children.size(), 1u);
+  auto *P = cast<ParagraphNode>(Q->Children[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "hello");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithFencedCode) {
+  auto Nodes = parseMarkdown(R"(> ```cpp
+> int x = 0;
+> ```)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+  ASSERT_EQ(Q->Children.size(), 1u);
+  auto *Code = cast<FencedCodeNode>(Q->Children[0]);
+  EXPECT_EQ(Code->Lang, "cpp");
+  ASSERT_EQ(Code->Lines.size(), 1u);
+  EXPECT_EQ(Code->Lines[0], "int x = 0;");
+}
+
+TEST_F(MarkdownParserTest, BlockQuoteWithEmphasis) {
+  auto Nodes = parseMarkdown("> an *important* note", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Q = cast<BlockQuoteNode>(Nodes[0]);
+  ASSERT_EQ(Q->Children.size(), 1u);
+  auto *P = cast<ParagraphNode>(Q->Children[0]);
+  ASSERT_EQ(P->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "an ");
+  auto *Em = cast<EmphasisNode>(P->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "important");
+  EXPECT_EQ(cast<TextNode>(P->Children[2])->Text, " note");
+}
+
+TEST_F(MarkdownParserTest, NestedBlockQuote) {
+  auto Nodes = parseMarkdown("> > deep", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *Outer = cast<BlockQuoteNode>(Nodes[0]);
+  ASSERT_EQ(Outer->Children.size(), 1u);
+  auto *Inner = cast<BlockQuoteNode>(Outer->Children[0]);
+  ASSERT_EQ(Inner->Children.size(), 1u);
+  auto *P = cast<ParagraphNode>(Inner->Children[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "deep");
+}
+
 } // namespace

>From 9325916d90af28470b9c7dd634b3c12471cc6dc5 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:22:13 -0400
Subject: [PATCH 22/27] [clang-doc] Address review feedback: review comments

---
 .../clang-doc/support/Markdown.cpp            |  2 +-
 .../clang-doc/support/Markdown.h              | 52 +++++++++----------
 2 files changed, 27 insertions(+), 27 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index fdfc619e0ea05..08277b1405e0b 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -293,7 +293,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
 // Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
 // opening fence; the fence, body lines, and closing fence are consumed.
 //
-// TODO: Follow CommonMark spec §4.5 more closely -- opening fences may be
+// TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
 // indented up to 3 spaces, the closing fence must use the same character and be
 // at least as long as the opening fence, and the closing fence may only be
 // followed by spaces. Doxygen specifics should be handled on a case-by-case
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 8c2055868671a..a9b00a5c10225 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -15,27 +15,27 @@
 /// llvm::isa<>/cast<>/dyn_cast<> to check what a node is.
 ///
 /// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
-///   TextNode       -- plain text run
-///   SoftBreakNode  -- soft line break
-///   HardBreakNode  -- hard line break (trailing spaces or backslash)
-///   InlineCodeNode -- inline code span (`code`)
-///   EmphasisNode   -- emphasis (*text* or _text_)
-///   StrongNode     -- strong emphasis (**text** or __text__)
+///   TextNode: plain text run
+///   SoftBreakNode: soft line break
+///   HardBreakNode: hard line break (trailing spaces or backslash)
+///   InlineCodeNode: inline code span (`code`)
+///   EmphasisNode: emphasis (*text* or _text_)
+///   StrongNode: strong emphasis (**text** or __text__)
 ///
 /// Block nodes:
-///   ParagraphNode     -- sequence of inline nodes
-///   HeadingNode       -- ATX heading (# through ######), level 1-6
-///   FencedCodeNode    -- fenced code block (``` or ~~~)
-///   TableNode         -- pipe table (raw row text; TODO: structured cells)
-///   UnorderedListNode -- bullet list (-, *, +)
-///   OrderedListNode   -- numbered list with explicit start number
-///   ListItemNode      -- single item inside a list
-///   BlockQuoteNode    -- block quote (>)
-///   ThematicBreakNode -- horizontal rule (---, ***, ___)
+///   ParagraphNode: sequence of inline nodes
+///   HeadingNode: ATX heading (# through ######), level 1-6
+///   FencedCodeNode: fenced code block (``` or ~~~)
+///   TableNode: pipe table (raw row text; TODO: structured cells)
+///   UnorderedListNode: bullet list (-, *, +)
+///   OrderedListNode: numbered list with explicit start number
+///   ListItemNode: single item inside a list
+///   BlockQuoteNode: block quote (>)
+///   ThematicBreakNode: horizontal rule (---, ***, ___)
 ///
 /// All nodes are arena-allocated. The caller owns the arena and must keep it
-/// alive for the lifetime of any returned nodes. The parser never crashes on
-/// malformed input; unrecognized text falls back to TextNode.
+/// alive for the lifetime of any returned nodes. Malformed input is parsed as
+/// plain text rather than rejected; unrecognized text falls back to TextNode.
 ///
 //===----------------------------------------------------------------------===//
 
@@ -58,7 +58,7 @@ enum class NodeKind {
   NK_InlineCode,
   NK_Emphasis,
   NK_Strong,
-  NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+  NK_LastInline = NK_Strong, // sentinel: all inline kinds are <= this
 
   // Block nodes
   NK_Paragraph,
@@ -70,12 +70,12 @@ enum class NodeKind {
   NK_ListItem,
   NK_BlockQuote,
   NK_ThematicBreak,
-  NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
+  NK_FirstBlock = NK_Paragraph, // sentinel: all block kinds are >= this
 };
 
-/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
-/// Nodes are arena-allocated and have no virtual destructor; use
-/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
+/// Base type for all Markdown AST nodes. Nodes are arena-allocated and have no
+/// virtual destructor; use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting.
 struct MDNode {
   NodeKind Kind;
   explicit MDNode(NodeKind K) : Kind(K) {}
@@ -93,7 +93,7 @@ struct TextNode : MDNode {
   static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
 };
 
-/// Soft line break -- a newline that does not end the paragraph.
+/// Soft line break: a newline that does not end the paragraph.
 struct SoftBreakNode : MDNode {
   SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {}
   static bool classof(const MDNode *N) {
@@ -101,7 +101,7 @@ struct SoftBreakNode : MDNode {
   }
 };
 
-/// Hard line break -- two trailing spaces or a backslash before a newline.
+/// Hard line break: two trailing spaces or a backslash before a newline.
 struct HardBreakNode : MDNode {
   HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {}
   static bool classof(const MDNode *N) {
@@ -143,7 +143,7 @@ struct StrongNode : MDNode {
 // Block nodes
 //===----------------------------------------------------------------------===//
 
-/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// A paragraph: sequence of inline nodes separated from other blocks by
 /// blank lines.
 struct ParagraphNode : MDNode {
   llvm::ArrayRef<MDNode *> Children;
@@ -169,7 +169,7 @@ struct HeadingNode : MDNode {
 /// "cpp"); empty when no language was specified. Lines contains the raw text
 /// of each interior line, without the opening or closing fence.
 ///
-/// TODO: Follow CommonMark spec §4.5 -- the opening fence may be indented up
+/// TODO: Follow CommonMark spec §4.5. The opening fence may be indented up
 /// to 3 spaces; the closing fence must use the same character and be at least
 /// as long as the opening fence; only spaces may follow the closing fence.
 struct FencedCodeNode : MDNode {

>From 9061cd48f9ec27d72f252414c047626bc1add513 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 12:42:12 -0400
Subject: [PATCH 23/27] [clang-doc] Implement CommonMark delimiter stack for
 emphasis and strong parsing

---
 .../clang-doc/support/Markdown.cpp            | 256 ++++++++++++++----
 .../clang-doc/MarkdownParserTest.cpp          |  14 +-
 2 files changed, 202 insertions(+), 68 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 08277b1405e0b..9ce5339fc8cb6 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -11,6 +11,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/DebugLog.h"
 #include "llvm/Support/StringSaver.h"
 #include <cassert>
@@ -184,50 +185,108 @@ static StringRef trimCodeSpan(StringRef Code) {
   return Code;
 }
 
-// Finds the start index of a closing emphasis run of exactly DelimLen copies of
-// DelimChar, searching forward from StartPos. Requires non-whitespace
-// immediately inside both the opening and closing delimiters and non-empty
-// content, a simplified take on the CommonMark §6.2 flanking rules. Returns
-// StringRef::npos if no valid closing run exists.
-static size_t findClosingDelim(StringRef S, size_t StartPos, char DelimChar,
-                               size_t DelimLen) {
-  size_t E = S.size();
-  // Opening delimiter is not left-flanking if whitespace follows it.
-  if (StartPos >= E || isSpace(S[StartPos]))
-    return StringRef::npos;
-  for (size_t J = StartPos; J + DelimLen <= E; ++J) {
-    if (S[J] != DelimChar)
-      continue;
-    size_t Run = countRun(S, J, DelimChar);
-    if (Run != DelimLen) {
-      J += Run - 1; // Skip the whole run; the loop's ++J lands past it.
-      continue;
-    }
-    // Reject empty content and closing runs that are not right-flanking.
-    if (J == StartPos || isSpace(S[J - 1]))
-      continue;
-    return J;
+// Treats the start and end of the string (passed as '\0') as whitespace for the
+// CommonMark flanking rules.
+static bool isFlankWhitespace(char C) { return C == '\0' || isSpace(C); }
+
+// Computes whether a delimiter run can open or close emphasis, from the
+// characters immediately before and after the run, per the CommonMark §6.2
+// flanking rules. Before and After are '\0' at the string boundaries.
+static void computeFlanking(char Before, char Marker, char After, bool &CanOpen,
+                            bool &CanClose) {
+  bool AfterWS = isFlankWhitespace(After);
+  bool BeforeWS = isFlankWhitespace(Before);
+  bool AfterPunct = isPunct(After);
+  bool BeforePunct = isPunct(Before);
+  bool LeftFlanking = !AfterWS && (!AfterPunct || BeforeWS || BeforePunct);
+  bool RightFlanking = !BeforeWS && (!BeforePunct || AfterWS || AfterPunct);
+  if (Marker == '_') {
+    // Underscore does not open or close emphasis intraword.
+    CanOpen = LeftFlanking && (!RightFlanking || BeforePunct);
+    CanClose = RightFlanking && (!LeftFlanking || AfterPunct);
+  } else {
+    CanOpen = LeftFlanking;
+    CanClose = RightFlanking;
   }
-  return StringRef::npos;
 }
 
+namespace {
+// One piece of inline content while emphasis is being resolved. A piece is
+// either a finished content node (text, code span, or a built emphasis or
+// strong node) or a run of delimiter characters that may still open or close
+// emphasis. Pieces form a doubly linked list through Prev/Next so matched runs
+// can be spliced out without shifting the others.
+struct InlinePiece {
+  MDNode *Node = nullptr; // content node, or null while this is a delimiter run
+  char Ch = 0;            // '*' or '_' for a delimiter run
+  size_t Len = 0;         // delimiters still available in the run
+  unsigned OrigLen = 0;   // original run length, for the multiple-of-three rule
+  bool CanOpen = false;
+  bool CanClose = false;
+  int Prev = -1;
+  int Next = -1;
+};
+} // namespace
+
 // Parses the inline content of a single line into a sequence of inline nodes:
-// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
-// _text_). Runs that match no construct become TextNodes. Emphasis and strong
-// recurse so their content may itself contain inline constructs. Text with no
-// markers yields a single TextNode.
+// inline code (`code`), emphasis (*text* or _text_), and strong (**text** or
+// __text__). Emphasis is resolved with a CommonMark-style delimiter stack: a
+// first pass tokenizes the line into text, code spans, and delimiter runs (each
+// tagged with its flanking flags), then a second pass walks closers back to
+// openers, honoring the multiple-of-three rule. Unmatched runs stay as text.
 //
-// TODO: This covers the common cases but not the full CommonMark §6 inline
-// model (delimiter stacks, intraword underscore rules, links, autolinks).
+// TODO: This does not yet handle links, autolinks, or backslash escapes.
 static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
                                       StringSaver &Saver) {
-  SmallVector<MDNode *> Nodes;
+  SmallVector<InlinePiece> Pool;
+  int Head = -1, Tail = -1;
+
+  auto makePiece = [&]() -> int {
+    Pool.emplace_back();
+    return Pool.size() - 1;
+  };
+  auto linkAtTail = [&](int Idx) {
+    Pool[Idx].Prev = Tail;
+    (Tail != -1 ? Pool[Tail].Next : Head) = Idx;
+    Tail = Idx;
+  };
+  auto appendNode = [&](MDNode *N) {
+    int Idx = makePiece();
+    Pool[Idx].Node = N;
+    linkAtTail(Idx);
+  };
+  // Content nodes pass through; a leftover delimiter run becomes a TextNode of
+  // its remaining characters.
+  auto pieceNode = [&](int P) -> MDNode * {
+    if (Pool[P].Node)
+      return Pool[P].Node;
+    return new (Arena)
+        TextNode(Saver.save(std::string(Pool[P].Len, Pool[P].Ch)));
+  };
+  // Merges adjacent TextNodes so unmatched delimiters coalesce with neighboring
+  // text, then copies the result into the arena.
+  auto finalize = [&](SmallVectorImpl<MDNode *> &Nodes) -> ArrayRef<MDNode *> {
+    SmallVector<MDNode *> Merged;
+    for (MDNode *Nd : Nodes) {
+      if (isa<TextNode>(Nd) && !Merged.empty() &&
+          isa<TextNode>(Merged.back())) {
+        StringRef Prev = cast<TextNode>(Merged.back())->Text;
+        StringRef Cur = cast<TextNode>(Nd)->Text;
+        Merged.back() =
+            new (Arena) TextNode(Saver.save(Prev.str() + Cur.str()));
+      } else {
+        Merged.push_back(Nd);
+      }
+    }
+    return allocateArray(Merged, Arena);
+  };
+
+  // Phase 1: tokenize the line into text, code spans, and delimiter runs.
   CharReader Reader(S);
   size_t TextStart = 0;
-
   auto flushText = [&](size_t End) {
     if (End > TextStart)
-      Nodes.push_back(new (Arena) TextNode(
+      appendNode(new (Arena) TextNode(
           Saver.save(S.substr(TextStart, End - TextStart))));
   };
 
@@ -246,7 +305,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
         flushText(Pos);
         StringRef Code =
             trimCodeSpan(S.substr(Pos + OpenLen, ClosePos - (Pos + OpenLen)));
-        Nodes.push_back(new (Arena) InlineCodeNode(Saver.save(Code)));
+        appendNode(new (Arena) InlineCodeNode(Saver.save(Code)));
         Reader.seek(ClosePos + OpenLen);
         TextStart = Reader.position();
         continue;
@@ -256,38 +315,117 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
       continue;
     }
 
-    // Emphasis (*text*, _text_) and strong (**text**, __text__).
+    // Delimiter run for emphasis or strong.
     if (C == '*' || C == '_') {
-      // Strong binds the two-delimiter form before single-delimiter emphasis.
-      if (Reader.peek(1) == C) {
-        size_t Close = findClosingDelim(S, Pos + 2, C, 2);
-        if (Close != StringRef::npos) {
-          flushText(Pos);
-          StringRef Inner = S.substr(Pos + 2, Close - (Pos + 2));
-          Nodes.push_back(new (Arena)
-                              StrongNode(parseInline(Inner, Arena, Saver)));
-          Reader.seek(Close + 2);
-          TextStart = Reader.position();
-          continue;
+      size_t RunLen = countRun(S, Pos, C);
+      flushText(Pos);
+      char Before = Pos == 0 ? '\0' : S[Pos - 1];
+      char After = Pos + RunLen < S.size() ? S[Pos + RunLen] : '\0';
+      int Idx = makePiece();
+      InlinePiece &D = Pool[Idx];
+      D.Ch = C;
+      D.Len = RunLen;
+      D.OrigLen = RunLen;
+      computeFlanking(Before, C, After, D.CanOpen, D.CanClose);
+      linkAtTail(Idx);
+      Reader.seek(Pos + RunLen);
+      TextStart = Reader.position();
+      continue;
+    }
+
+    Reader.advance();
+  }
+  flushText(S.size());
+
+  // Phase 2: match closers back to openers. OpenersBottom records, per closer
+  // kind, how far back a failed search needs to look, keyed by delimiter char,
+  // run length mod 3, and whether the closer can also open.
+  int OpenersBottom[12];
+  for (int &B : OpenersBottom)
+    B = -1;
+  auto bucket = [](const InlinePiece &P) {
+    return (P.Ch == '_' ? 6 : 0) + (P.OrigLen % 3) * 2 + (P.CanOpen ? 1 : 0);
+  };
+
+  int Current = Head;
+  while (Current != -1) {
+    // Advance to the next run that can close.
+    while (Current != -1 &&
+           !(Pool[Current].Ch && Pool[Current].CanClose && Pool[Current].Len))
+      Current = Pool[Current].Next;
+    if (Current == -1)
+      break;
+    int Closer = Current;
+    int Key = bucket(Pool[Closer]);
+
+    // Search back for the nearest matching opener.
+    int Opener = Pool[Closer].Prev;
+    bool Found = false;
+    while (Opener != -1 && Opener != OpenersBottom[Key]) {
+      InlinePiece &O = Pool[Opener];
+      if (O.Ch == Pool[Closer].Ch && O.Len && O.CanOpen) {
+        unsigned Sum = O.OrigLen + Pool[Closer].OrigLen;
+        bool OddMatch = (O.CanClose || Pool[Closer].CanOpen) && Sum % 3 == 0 &&
+                        !(O.OrigLen % 3 == 0 && Pool[Closer].OrigLen % 3 == 0);
+        if (!OddMatch) {
+          Found = true;
+          break;
         }
       }
-      size_t Close = findClosingDelim(S, Pos + 1, C, 1);
-      if (Close != StringRef::npos) {
-        flushText(Pos);
-        StringRef Inner = S.substr(Pos + 1, Close - (Pos + 1));
-        Nodes.push_back(new (Arena)
-                            EmphasisNode(parseInline(Inner, Arena, Saver)));
-        Reader.seek(Close + 1);
-        TextStart = Reader.position();
-        continue;
-      }
+      Opener = Pool[Opener].Prev;
     }
 
-    Reader.advance();
+    if (!Found) {
+      OpenersBottom[Key] = Pool[Closer].Prev;
+      // A run that cannot also open will never match anything; keep its text
+      // but stop treating it as a delimiter.
+      if (!Pool[Closer].CanOpen)
+        Pool[Closer].CanClose = false;
+      Current = Pool[Closer].Next;
+      continue;
+    }
+
+    // Wrap the pieces between opener and closer, consuming one delimiter from
+    // each side for emphasis or two for strong.
+    unsigned Use = Pool[Opener].Len >= 2 && Pool[Closer].Len >= 2 ? 2 : 1;
+    SmallVector<MDNode *> Inner;
+    for (int P = Pool[Opener].Next; P != Closer; P = Pool[P].Next)
+      Inner.push_back(pieceNode(P));
+    Pool[Opener].Len -= Use;
+    Pool[Closer].Len -= Use;
+    MDNode *Emph =
+        Use == 2
+            ? static_cast<MDNode *>(new (Arena) StrongNode(finalize(Inner)))
+            : static_cast<MDNode *>(new (Arena) EmphasisNode(finalize(Inner)));
+    int EP = makePiece();
+    Pool[EP].Node = Emph;
+    Pool[EP].Prev = Opener;
+    Pool[EP].Next = Closer;
+    Pool[Opener].Next = EP;
+    Pool[Closer].Prev = EP;
+
+    // Drop the opener or closer once its run is fully consumed.
+    if (Pool[Opener].Len == 0) {
+      int Pr = Pool[Opener].Prev;
+      Pool[EP].Prev = Pr;
+      (Pr != -1 ? Pool[Pr].Next : Head) = EP;
+    }
+    if (Pool[Closer].Len == 0) {
+      int Nx = Pool[Closer].Next;
+      Pool[EP].Next = Nx;
+      (Nx != -1 ? Pool[Nx].Prev : Tail) = EP;
+      Current = Nx;
+    } else {
+      Current = Closer;
+    }
   }
 
-  flushText(S.size());
-  return allocateArray(Nodes, Arena);
+  // Phase 3: collect the surviving pieces, dropping fully consumed delimiters.
+  SmallVector<MDNode *> Result;
+  for (int P = Head; P != -1; P = Pool[P].Next)
+    if (Pool[P].Node || Pool[P].Len)
+      Result.push_back(pieceNode(P));
+  return finalize(Result);
 }
 
 // Parses a fenced code block opened with ``` or ~~~. The cursor must be on the
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index aedcd9407b197..49e61e8c129fa 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -590,18 +590,14 @@ TEST_F(MarkdownParserTest, IntrawordStrongAsterisk) {
   EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
 }
 
-// CommonMark §6.2 Example 360: intraword underscores do NOT open emphasis, so
-// "foo_bar_" is literal text. DIVERGENCE: this parser lacks the intraword
-// underscore rule (see the findClosingDelim TODO) and treats it as emphasis.
-TEST_F(MarkdownParserTest, IntrawordUnderscoreEmphasisDivergence) {
+// CommonMark §6.2 Example 360: intraword underscores do not open or close
+// emphasis, so "foo_bar_" stays as literal text.
+TEST_F(MarkdownParserTest, IntrawordUnderscoreIsText) {
   auto Nodes = parseMarkdown("foo_bar_", Arena);
   ASSERT_EQ(Nodes.size(), 1u);
   auto *P = cast<ParagraphNode>(Nodes[0]);
-  ASSERT_EQ(P->Children.size(), 2u);
-  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo");
-  auto *Em = cast<EmphasisNode>(P->Children[1]);
-  ASSERT_EQ(Em->Children.size(), 1u);
-  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+  ASSERT_EQ(P->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "foo_bar_");
 }
 
 // CommonMark §6.1 Example 331: a code span strips one leading and trailing

>From 3e6c805a5450fb844e4c59dd6c2f0bca0fdef3eb Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 13:05:41 -0400
Subject: [PATCH 24/27] [clang-doc] Add delimiter stack edge case tests for
 nested emphasis and rule-of-three

---
 .../clang-doc/MarkdownParserTest.cpp          | 65 +++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 49e61e8c129fa..5571478578d93 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -610,6 +610,71 @@ TEST_F(MarkdownParserTest, CodeSpanStripsSurroundingSpaces) {
   EXPECT_EQ(cast<InlineCodeNode>(P->Children[0])->Code, "x");
 }
 
+// CommonMark §6.2 Example 413: a triple run splits across two matches, the
+// inner pair forming strong and the outer pair emphasis, so "***foo***" is
+// emphasis wrapping strong. The old findClosingDelim search matched a run only
+// against an equal-length run and could not split one this way.
+TEST_F(MarkdownParserTest, TripleDelimiterBoldItalic) {
+  auto Nodes = parseMarkdown("***foo***", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *Em = cast<EmphasisNode>(P->Children[0]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  auto *St = cast<StrongNode>(Em->Children[0]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "foo");
+}
+
+// CommonMark §6.2: emphasis containing a strong span, "*foo **bar** baz*". The
+// outer emphasis spans delimiter runs of two different lengths, which the
+// equal-length findClosingDelim search could not pair.
+TEST_F(MarkdownParserTest, MixedDelimitersEmStrongEm) {
+  auto Nodes = parseMarkdown("*foo **bar** baz*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *Em = cast<EmphasisNode>(P->Children[0]);
+  ASSERT_EQ(Em->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "foo ");
+  auto *St = cast<StrongNode>(Em->Children[1]);
+  ASSERT_EQ(St->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "bar");
+  EXPECT_EQ(cast<TextNode>(Em->Children[2])->Text, " baz");
+}
+
+// CommonMark §6.2: strong containing emphasis with text on both sides,
+// "**foo *bar* baz**". The inner emphasis closes before the outer strong does,
+// which the single forward scan handled only by coincidence of nesting order.
+TEST_F(MarkdownParserTest, NestedEmphasisInsideStrong) {
+  auto Nodes = parseMarkdown("**foo *bar* baz**", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 1u);
+  auto *St = cast<StrongNode>(P->Children[0]);
+  ASSERT_EQ(St->Children.size(), 3u);
+  EXPECT_EQ(cast<TextNode>(St->Children[0])->Text, "foo ");
+  auto *Em = cast<EmphasisNode>(St->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+  EXPECT_EQ(cast<TextNode>(St->Children[2])->Text, " baz");
+}
+
+// CommonMark §6.2 rule of three: when a closer can also open, it may not match
+// an opener whose run length sums with the closer's to a multiple of three. In
+// "**foo*bar*" the leading ** cannot close against the inner *, so ** stays
+// literal and only *bar* becomes emphasis.
+TEST_F(MarkdownParserTest, MultipleOfThreeBlocksClose) {
+  auto Nodes = parseMarkdown("**foo*bar*", Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *P = cast<ParagraphNode>(Nodes[0]);
+  ASSERT_EQ(P->Children.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(P->Children[0])->Text, "**foo");
+  auto *Em = cast<EmphasisNode>(P->Children[1]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "bar");
+}
+
 TEST_F(MarkdownParserTest, BlockQuote) {
   auto Nodes = parseMarkdown("> hello", Arena);
   ASSERT_EQ(Nodes.size(), 1u);

>From 518cb8c4205309e71711a11cea0ccaec61a96f28 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 13:33:14 -0400
Subject: [PATCH 25/27] [clang-doc] Parse table cells with inline content in
 pipe tables

---
 .../clang-doc/support/Markdown.cpp            | 42 +++++++++++++++----
 .../clang-doc/support/Markdown.h              | 25 +++++++----
 .../clang-doc/MarkdownParserTest.cpp          | 42 ++++++++++++++++++-
 3 files changed, 93 insertions(+), 16 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 9ce5339fc8cb6..4e07c56d9a4ad 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -460,18 +460,44 @@ static FencedCodeNode *parseFencedCode(LineReader &Reader,
   return Code;
 }
 
+// Splits a pipe table row into cell texts. A single optional leading and
+// trailing pipe are dropped, then the remainder is split on '|' and each cell
+// is trimmed.
+// TODO: A '|' inside a code span or escaped as "\|" should not split a cell.
+static void splitTableRow(StringRef Row, SmallVectorImpl<StringRef> &Cells) {
+  Row = Row.trim();
+  if (Row.starts_with("|"))
+    Row = Row.drop_front();
+  if (Row.ends_with("|"))
+    Row = Row.drop_back();
+  SmallVector<StringRef> Parts;
+  Row.split(Parts, '|');
+  for (StringRef Part : Parts)
+    Cells.push_back(Part.trim());
+}
+
 // Parses a pipe table. The cursor must be on the header row, with a separator
-// row following; consecutive lines containing a | are taken as rows.
+// row following; consecutive lines containing a | are taken as body rows. Each
+// cell's text is parsed into inline nodes.
 static TableNode *parsePipeTable(LineReader &Reader, BumpPtrAllocator &Arena,
                                  StringSaver &Saver) {
-  SmallVector<StringRef> Rows;
-  // TODO: Rows are kept as raw line text for now. Table cells may contain
-  // inline content (emphasis, code spans, links), so each row may need to be
-  // split on '|' and parsed further into structured cells.
+  auto parseRow = [&](StringRef Line) -> TableRow {
+    SmallVector<StringRef> CellTexts;
+    splitTableRow(Line, CellTexts);
+    SmallVector<TableCell> Cells;
+    for (StringRef Text : CellTexts)
+      Cells.push_back(TableCell{parseInline(Text, Arena, Saver)});
+    return TableRow{allocateArray(Cells, Arena)};
+  };
+
+  TableRow Header = parseRow(Reader.advance());
+  Reader.advance(); // skip the alignment separator row
+  SmallVector<TableRow> Body;
   while (!Reader.atEnd() && Reader.peek().trim().contains('|'))
-    Rows.push_back(Saver.save(Reader.advance().trim()));
-  auto *Table = new (Arena) TableNode(allocateArray(Rows, Arena));
-  LDBG() << "emitting TableNode rows=" << Rows.size();
+    Body.push_back(parseRow(Reader.advance()));
+  auto *Table = new (Arena) TableNode(Header, allocateArray(Body, Arena));
+  LDBG() << "emitting TableNode header_cells=" << Header.Cells.size()
+         << " body_rows=" << Body.size();
   return Table;
 }
 
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index a9b00a5c10225..622f3cbb3fc63 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -26,7 +26,7 @@
 ///   ParagraphNode: sequence of inline nodes
 ///   HeadingNode: ATX heading (# through ######), level 1-6
 ///   FencedCodeNode: fenced code block (``` or ~~~)
-///   TableNode: pipe table (raw row text; TODO: structured cells)
+///   TableNode: pipe table (a header row and body rows of cells)
 ///   UnorderedListNode: bullet list (-, *, +)
 ///   OrderedListNode: numbered list with explicit start number
 ///   ListItemNode: single item inside a list
@@ -182,13 +182,24 @@ struct FencedCodeNode : MDNode {
   }
 };
 
-/// Pipe table. Rows contains the raw text of each row line including the
-/// header and separator rows.
-/// TODO: replace with a structured header/body/cell representation.
+/// A single table cell. Children holds the cell's parsed inline content.
+struct TableCell {
+  llvm::ArrayRef<MDNode *> Children;
+};
+
+/// A table row, split into cells on the row's pipe characters.
+struct TableRow {
+  llvm::ArrayRef<TableCell> Cells;
+};
+
+/// Pipe table. Header is the first row; Body holds the rows following the
+/// alignment separator. Each cell's text is parsed into inline nodes.
+/// TODO: capture per-column alignment from the separator row.
 struct TableNode : MDNode {
-  llvm::ArrayRef<llvm::StringRef> Rows;
-  explicit TableNode(llvm::ArrayRef<llvm::StringRef> Rows)
-      : MDNode(NodeKind::NK_Table), Rows(Rows) {}
+  TableRow Header;
+  llvm::ArrayRef<TableRow> Body;
+  TableNode(TableRow Header, llvm::ArrayRef<TableRow> Body)
+      : MDNode(NodeKind::NK_Table), Header(Header), Body(Body) {}
   static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Table; }
 };
 
diff --git a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
index 5571478578d93..d07f1fe1a92e0 100644
--- a/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
+++ b/clang-tools-extra/unittests/clang-doc/MarkdownParserTest.cpp
@@ -78,7 +78,47 @@ TEST_F(MarkdownParserTest, PipeTable) {
 | 1 | 2 |)",
                              Arena);
   ASSERT_EQ(Nodes.size(), 1u);
-  EXPECT_TRUE(isa<TableNode>(Nodes[0]));
+  auto *T = cast<TableNode>(Nodes[0]);
+  ASSERT_EQ(T->Header.Cells.size(), 2u);
+  ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(T->Header.Cells[0].Children[0])->Text, "A");
+  ASSERT_EQ(T->Header.Cells[1].Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(T->Header.Cells[1].Children[0])->Text, "B");
+  ASSERT_EQ(T->Body.size(), 1u);
+  ASSERT_EQ(T->Body[0].Cells.size(), 2u);
+  EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[0].Children[0])->Text, "1");
+  EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[1].Children[0])->Text, "2");
+}
+
+// A table cell's text runs through the inline parser, so emphasis inside a cell
+// becomes an EmphasisNode rather than literal text.
+TEST_F(MarkdownParserTest, TableCellWithEmphasis) {
+  auto Nodes = parseMarkdown(R"(| *a* | b |
+|---|---|
+| c | d |)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *T = cast<TableNode>(Nodes[0]);
+  ASSERT_EQ(T->Header.Cells.size(), 2u);
+  ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+  auto *Em = cast<EmphasisNode>(T->Header.Cells[0].Children[0]);
+  ASSERT_EQ(Em->Children.size(), 1u);
+  EXPECT_EQ(cast<TextNode>(Em->Children[0])->Text, "a");
+  EXPECT_EQ(cast<TextNode>(T->Header.Cells[1].Children[0])->Text, "b");
+}
+
+// A code span inside a table cell becomes an InlineCodeNode.
+TEST_F(MarkdownParserTest, TableCellWithInlineCode) {
+  auto Nodes = parseMarkdown(R"(| `x` | y |
+|---|---|
+| z | w |)",
+                             Arena);
+  ASSERT_EQ(Nodes.size(), 1u);
+  auto *T = cast<TableNode>(Nodes[0]);
+  ASSERT_EQ(T->Header.Cells.size(), 2u);
+  ASSERT_EQ(T->Header.Cells[0].Children.size(), 1u);
+  EXPECT_EQ(cast<InlineCodeNode>(T->Header.Cells[0].Children[0])->Code, "x");
+  EXPECT_EQ(cast<TextNode>(T->Body[0].Cells[0].Children[0])->Text, "z");
 }
 
 TEST_F(MarkdownParserTest, PipeCharacterWithoutSepRowIsPlainText) {

>From 2319b205870a405d501ddde6d39bdbbbb8d3572a Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 16:32:44 -0400
Subject: [PATCH 26/27] [clang-doc] Trim over-explained comments and fix
 inaccurate parseMarkdown doc

---
 clang-tools-extra/clang-doc/support/Markdown.cpp | 16 +++++-----------
 clang-tools-extra/clang-doc/support/Markdown.h   |  6 +-----
 2 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/clang-tools-extra/clang-doc/support/Markdown.cpp b/clang-tools-extra/clang-doc/support/Markdown.cpp
index 4e07c56d9a4ad..079ae5d12e9f2 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.cpp
+++ b/clang-tools-extra/clang-doc/support/Markdown.cpp
@@ -89,10 +89,8 @@ static unsigned atxHeadingLevel(StringRef Line) {
   return Level;
 }
 
-// A forward cursor over the lines of a paragraph. Encapsulates the parse
-// position so the loop can inspect the current or an upcoming line and consume
-// lines without manual index arithmetic. Lines are stored untrimmed; callers
-// trim where they need a normalized view.
+// A forward cursor over the lines of a paragraph. Lines are stored untrimmed;
+// callers trim where they need a normalized view.
 class LineReader {
 public:
   explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
@@ -125,11 +123,8 @@ class LineReader {
   size_t Pos = 0;
 };
 
-// A forward cursor over the characters of a string. The character-level analog
-// of LineReader: the inline scanner inspects the current or an upcoming
-// character and consumes characters without manual index arithmetic. position()
-// and seek() let it interoperate with the index-based run and delimiter helpers
-// below, since inline constructs are not consumed one character at a time.
+// A forward cursor over the characters of a string. position() and seek() let
+// it interoperate with the index-based run and delimiter helpers below.
 class CharReader {
 public:
   explicit CharReader(StringRef S) : S(S) {}
@@ -434,8 +429,7 @@ static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena,
 // TODO: Follow CommonMark spec §4.5 more closely. Opening fences may be
 // indented up to 3 spaces, the closing fence must use the same character and be
 // at least as long as the opening fence, and the closing fence may only be
-// followed by spaces. Doxygen specifics should be handled on a case-by-case
-// basis.
+// followed by spaces.
 static FencedCodeNode *parseFencedCode(LineReader &Reader,
                                        BumpPtrAllocator &Arena,
                                        StringSaver &Saver) {
diff --git a/clang-tools-extra/clang-doc/support/Markdown.h b/clang-tools-extra/clang-doc/support/Markdown.h
index 622f3cbb3fc63..01027c170e51e 100644
--- a/clang-tools-extra/clang-doc/support/Markdown.h
+++ b/clang-tools-extra/clang-doc/support/Markdown.h
@@ -168,10 +168,6 @@ struct HeadingNode : MDNode {
 /// Fenced code block opened with ``` or ~~~. Lang is the info string (e.g.
 /// "cpp"); empty when no language was specified. Lines contains the raw text
 /// of each interior line, without the opening or closing fence.
-///
-/// TODO: Follow CommonMark spec §4.5. The opening fence may be indented up
-/// to 3 spaces; the closing fence must use the same character and be at least
-/// as long as the opening fence; only spaces may follow the closing fence.
 struct FencedCodeNode : MDNode {
   llvm::StringRef Lang;
   llvm::ArrayRef<llvm::StringRef> Lines;
@@ -259,7 +255,7 @@ struct ThematicBreakNode : MDNode {
 /// Parse Markdown from a single paragraph of plain text. Returns a list of
 /// top-level nodes allocated in Arena. Returns an empty ArrayRef only for empty
 /// or whitespace-only input; plain text with no Markdown constructs returns a
-/// single TextNode.
+/// single ParagraphNode.
 ///
 /// The caller must keep Arena alive for the lifetime of any returned nodes.
 llvm::ArrayRef<MDNode *> parseMarkdown(llvm::StringRef ParagraphText,

>From 6a3d7c120676e25bc48aa1aae53cce1c00f8d8a8 Mon Sep 17 00:00:00 2001
From: Neil-N4 <neilnair4 at gmail.com>
Date: Sat, 13 Jun 2026 16:32:51 -0400
Subject: [PATCH 27/27] [clang-doc] Trim over-explained comments and fix
 inaccurate parseMarkdown doc

---
 .gitignore                    |   1 +
 CLANG_COMMENT_PARSER_NOTES.md | 175 +++++++++++++++++++++++++++
 MARKDOWN_PARSER_RESEARCH.md   | 220 ++++++++++++++++++++++++++++++++++
 followed                      |   0
 prefix                        |   0
 5 files changed, 396 insertions(+)
 create mode 100644 CLANG_COMMENT_PARSER_NOTES.md
 create mode 100644 MARKDOWN_PARSER_RESEARCH.md
 create mode 100644 followed
 create mode 100644 prefix

diff --git a/.gitignore b/.gitignore
index 9d4e86ab10caa..5addf57e504b5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,3 +88,4 @@ pythonenv*
 /clang/utils/analyzer/projects/*/RefScanBuildResults
 # automodapi puts generated documentation files here.
 /lldb/docs/python_api/
+GSOC_CONTEXT.md
diff --git a/CLANG_COMMENT_PARSER_NOTES.md b/CLANG_COMMENT_PARSER_NOTES.md
new file mode 100644
index 0000000000000..6e7193368e2f9
--- /dev/null
+++ b/CLANG_COMMENT_PARSER_NOTES.md
@@ -0,0 +1,175 @@
+# Clang Comment Parser Notes (Markdown integration scouting)
+
+Scope: understand how Clang turns a doc comment into a comment AST, where
+paragraph boundaries get drawn, and where the standalone Markdown library could
+eventually hook in. Read against `clang/lib/AST/CommentLexer.cpp`,
+`clang/lib/AST/CommentParser.cpp`, and `clang/include/clang/AST/Comment.h`.
+
+The pipeline is three stages: raw text -> tokens (lexer) -> comment AST
+(parser, via Sema) -> consumer (clang-doc's serializer today).
+
+## 1. Tokenization (CommentLexer)
+
+The lexer emits a flat stream of comment tokens. The token kinds live in
+`tok::TokenKind` (`clang/include/clang/AST/CommentLexer.h:32`):
+
+- `eof`, `newline`, `text` -- the common cases.
+- `unknown_command`, `backslash_command`, `at_command` -- `\foo` / `@foo`.
+- `verbatim_block_begin`, `verbatim_block_line`, `verbatim_block_end` --
+  `\code ... \endcode` and friends.
+- `verbatim_line_name`, `verbatim_line_text` -- single-line verbatim commands.
+- `html_start_tag`, `html_ident`, `html_equals`, `html_quoted_string`,
+  `html_greater`, `html_slash_greater`, `html_end_tag` -- raw HTML passthrough.
+
+Key behaviors:
+
+- The lexer runs a two-level state machine: a comment state (`//` vs `/* */`)
+  and a lexing state (`LS_Normal`, verbatim block/line states, HTML states),
+  both declared in `CommentLexer.h`.
+- Plain text and newlines: `Lexer::lexCommentText()`
+  (`CommentLexer.cpp:305`) walks a line, using `skipTextToken()`
+  (`CommentLexer.cpp:282`) to find the next boundary and emitting a single
+  `tok::text` for the run, then a `tok::newline` at each line end. Newlines are
+  their own tokens -- this matters for paragraph detection downstream.
+- Leading decoration is stripped per line by `skipLineStartingDecorations()`
+  (`CommentLexer.cpp:90`): horizontal whitespace plus a single leading `*` in
+  `/* */` comments. It is called after each newline (`CommentLexer.cpp:322`).
+  So by the time text reaches the parser, the `*` gutter and `///` markers are
+  already gone, but inner text is otherwise verbatim (interior spaces kept).
+- Verbatim blocks are NOT decomposed. `setupAndLexVerbatimBlock()`
+  (`CommentLexer.cpp:468`) and `lexVerbatimBlockFirstLine()`
+  (`CommentLexer.cpp:493`) emit one `tok::verbatim_block_line` per line with the
+  raw line text preserved. This is Doxygen's existing `\code` path: content is
+  already kept intact, not re-lexed.
+
+## 2. Parsing and paragraph boundaries (CommentParser)
+
+Entry point: `Parser::parseFullComment()` (`CommentParser.cpp:923`) returns a
+`FullComment *`. It skips leading newlines, then loops calling
+`parseBlockContent()` until `eof`, collecting `BlockContentComment *` blocks
+(`CommentParser.cpp:928-935`).
+
+`parseBlockContent()` (`CommentParser.cpp:892`) dispatches on the current token:
+text / command / HTML -> `parseParagraphOrBlockCommand()`;
+`verbatim_block_begin` -> `parseVerbatimBlock()`;
+`verbatim_line_name` -> `parseVerbatimLine()`.
+
+Paragraph assembly and boundaries happen in
+`parseParagraphOrBlockCommand()` (`CommentParser.cpp:719`). It accumulates a
+`SmallVector<InlineContentComment *> Content` and ends the paragraph on:
+
+- `verbatim_block_begin`, `verbatim_line_name`, or `eof`
+  (`CommentParser.cpp:724-727`).
+- A block command (`Info->IsBlockCommand`), e.g. `\brief`, `\param`
+  (`CommentParser.cpp:739-743`). If the paragraph is still empty it becomes a
+  block command instead (`parseBlockCommand()`).
+- **A blank line.** This is the core boundary rule
+  (`CommentParser.cpp:765-786`): on a `tok::newline`, peek the next token; two
+  consecutive newlines (or `newline`, whitespace-only `text`, `newline`) end
+  the paragraph. A single newline is non-terminating -- it just calls
+  `addTrailingNewline()` on the last inline node and continues.
+
+So paragraph boundaries are purely lexical (blank line / block command / EOF),
+decided here and nowhere else. There is no Markdown-aware blocking: a fenced
+code block written with ```` ``` ```` is, to this parser, just `text` lines
+inside one paragraph (unless the author used `\code`, which becomes a verbatim
+block instead).
+
+Inline content within a paragraph is built through Sema factory calls, not
+constructors: `S.actOnText()` for `tok::text` (`CommentParser.cpp:798`),
+`parseInlineCommand()` for inline `\foo`, `parseHTMLStartTag()` /
+`parseHTMLEndTag()` for HTML. The finished paragraph is created by
+`S.actOnParagraphComment(S.copyArray(ArrayRef(Content)))`
+(`CommentParser.cpp:817`).
+
+## 3. The comment AST (Comment.h)
+
+Two abstract bases under `Comment`:
+
+- `InlineContentComment` (`Comment.h:271`): `TextComment` (`Comment.h:297`),
+  `InlineCommandComment`, `HTMLStartTagComment`, `HTMLEndTagComment`.
+- `BlockContentComment` (`Comment.h:559`): `ParagraphComment` (`Comment.h:576`),
+  `BlockCommandComment` (and its `ParamCommandComment` / `TParamCommandComment`),
+  `VerbatimBlockComment` (`Comment.h:900`), `VerbatimLineComment`.
+
+`FullComment` (`Comment.h:1106`) is the top node, holding the block list.
+
+For Markdown, the text we care about lives in:
+
+- `ParagraphComment::Content` -- `ArrayRef<InlineContentComment *>`
+  (`Comment.h:577`), iterable via `child_begin()`/`child_end()`.
+- `TextComment::getText()` -> `StringRef` (`Comment.h:315`). This is the leaf
+  source of all plain text. Inline command and HTML nodes wrap/annotate text
+  but don't hold paragraph prose directly.
+
+To reconstruct a paragraph's plain text you walk `ParagraphComment`'s children
+and concatenate each `TextComment::getText()` (re-inserting the soft newlines
+the lexer split on). That is exactly the shape `parseMarkdown()` already
+expects: one paragraph of plain text in, block nodes out.
+
+## 4. Where clang-doc consumes this today
+
+clang-doc does not touch the comment parser directly. In `Serialize.cpp` it
+calls `RawComment::parse()` to get a `FullComment`
+(`Serialize.cpp:611, 931`), then `Serializer::parseFullComment()`
+(`Serialize.cpp:385`) runs `ClangDocCommentVisitor` (`Serialize.cpp:204`),
+which mirrors the comment AST into clang-doc's own `CommentInfo` tree
+(`visitTextComment`, etc.). `CommentInfo` is what the generators see.
+
+The current JSON Markdown integration (on `md-json-integration`) parses at the
+**generator** level: in `JSONGenerator.cpp`, when a `CK_ParagraphComment`'s
+children are all `CK_TextComment`, it concatenates their text and runs
+`parseMarkdown()`. So the prose has already made a round trip through tokens ->
+`FullComment` -> `CommentInfo` before we parse it.
+
+## 5. Candidate hook points (shallow to deep)
+
+1. **Generator level (current).** Parse in `JSONGenerator.cpp` over
+   `CommentInfo`. Pros: isolated, no Clang AST changes, already working. Cons:
+   clang-doc-only; every other comment consumer (the other generators, anything
+   in Clang) gets nothing; relies on `CommentInfo` faithfully preserving the
+   text and on the all-text-children heuristic.
+
+2. **clang-doc serializer level.** Parse inside `ClangDocCommentVisitor` when a
+   `ParagraphComment` is visited (`Serialize.cpp:~204-249`), attaching parsed
+   Markdown to `CommentInfo` once for all generators. Pros: every clang-doc
+   backend benefits, single place, still no core-Clang change. Cons: still
+   clang-doc-only; needs a `CommentInfo` field to carry the parsed tree.
+
+3. **Comment parser / Sema level (long-term goal).** Teach the comment layer
+   itself about Markdown. The clean seam is `parseParagraphOrBlockCommand()`
+   (`CommentParser.cpp:719`) / `actOnParagraphComment()`: a paragraph's
+   collected text is available right before the node is built, and paragraph
+   boundaries are already computed here. A Markdown pass could run over each
+   `ParagraphComment`'s text (or over `FullComment` as a post-pass) and produce
+   structured nodes shared by all of Clang, not just clang-doc. Cons: largest
+   blast radius (touches a core AST that many tools depend on), needs new AST
+   node types or a side table, and has to coexist with the existing Doxygen
+   constructs (`\code` verbatim blocks, inline commands, HTML) rather than
+   re-interpreting them.
+
+### Practical notes for whichever hook
+
+- Verbatim blocks already solve the fenced-code case via `\code`/`\endcode`;
+  Markdown ```` ``` ```` fences arrive as ordinary `text` lines. A hook should
+  decide whether to treat ```` ``` ```` as code (parse it) or defer to existing
+  `\code` blocks, and per Erick's guidance, skip paragraphs that already carry
+  Doxygen code tags.
+- Paragraph text reaches us with the `///` and `*` gutters stripped
+  (`skipLineStartingDecorations`, `CommentLexer.cpp:90`) but soft newlines
+  intact as separate tokens, so reassembly needs to re-join lines with `\n`,
+  which is what the generator hook already does.
+- Boundaries are blank-line based, so a multi-line Markdown construct (table,
+  list, fenced block) survives as a single paragraph only if it has no blank
+  lines inside it. Anything separated by a blank line is already a distinct
+  `ParagraphComment` and would be parsed independently.
+
+## Open questions / follow-ups
+
+- Does `CommentInfo` preserve enough to reconstruct the original line breaks
+  losslessly for option 1/2, or do we need to carry the raw paragraph text
+  through? (Check what `visitTextComment` stores vs. drops.)
+- For option 3, would mentors want new `comments::` AST nodes, or a separate
+  Markdown tree hung off `FullComment` as a side channel?
+- How should this interact with `-fparse-all-comments` and with non-Doxygen
+  comment styles?
diff --git a/MARKDOWN_PARSER_RESEARCH.md b/MARKDOWN_PARSER_RESEARCH.md
new file mode 100644
index 0000000000000..c6581e881357e
--- /dev/null
+++ b/MARKDOWN_PARSER_RESEARCH.md
@@ -0,0 +1,220 @@
+# Markdown Parser Research: Block vs Inline Split and Inline Scanners
+
+Scope: how three established Markdown parsers (cmark, pulldown-cmark, goldmark)
+separate block parsing from inline parsing, how their character-level inline
+scanners are built, and which patterns are worth pulling into the clang-doc
+parser (`clang-tools-extra/clang-doc/support/Markdown.cpp`).
+
+Sources read:
+- cmark `src/blocks.c` and `src/inlines.c` (reference CommonMark C library)
+- pulldown-cmark `pulldown-cmark/src/parse.rs` (Rust pull parser)
+- goldmark `parser/parser.go` (Go CommonMark parser)
+
+## The common architecture: two phases
+
+All three parsers make the same top-level decision: **build the block tree
+first from raw line spans, then parse inline content in a second pass.** Block
+parsing never looks inside a run of text for emphasis or code; it only decides
+where paragraphs, code fences, lists, headings, and quotes begin and end. Inline
+parsing runs afterward over the text that the block phase collected.
+
+This split exists because block boundaries are decided line by line (a blank
+line ends a paragraph, a fence line opens a code block) while inline structure is
+decided character by character within already-delimited text. Keeping them
+separate means the inline scanner can assume it is looking at a single
+self-contained run of text with no block transitions to worry about.
+
+---
+
+## cmark (C)
+
+### Block / inline split
+- Lines are fed through `S_parser_feed` -> `S_process_line`. Each line runs
+  three steps: `check_open_blocks` (do existing open containers continue?),
+  `open_new_blocks` (does a new block start here?), and
+  `add_text_to_container` (append the raw line text to the matched block).
+- Open blocks are tracked as a tree of `cmark_node` with a
+  `CMARK_NODE__OPEN` flag and a `parser->current` pointer. `add_child` walks up
+  the tree until it finds a parent that can accept the new child.
+- Block boundaries are detected by small dedicated scanners:
+  `scan_open_code_fence` / `scan_close_code_fence`, `scan_atx_heading_start`,
+  `parse_list_marker`, `parse_block_quote_prefix`, `S_scan_thematic_break`,
+  `scan_setext_heading_line`. Indentation is normalized by
+  `S_find_first_nonspace` and `S_advance_offset` (with tab expansion).
+- Inline parsing is explicitly deferred. `cmark_parser_finish` calls
+  `finalize_document`, then `process_inlines` iterates the finished tree with
+  `cmark_iter_new` and, for blocks where `contains_inlines()` is true
+  (paragraphs and headings only), calls `cmark_parse_inlines`. After a block's
+  inlines are parsed, its raw text buffer is freed.
+
+### Inline scanner
+- The scanner is a character loop over a `subject` struct that holds the input
+  chunk and a `pos` cursor, plus a `last_delim` delimiter stack pointer and a
+  `last_bracket` pointer for links.
+- `parse_inline` reads the current byte with `peek_char` and dispatches with a
+  switch: backtick -> `handle_backticks`, backslash -> `handle_backslash`,
+  `*` `_` `'` `"` -> `handle_delim`, `[` / `]` -> bracket handling, and a
+  default case that grabs a whole text run.
+- Text runs are accumulated in bulk: `subject_find_special_char` scans forward
+  using a lookup table of special bytes (`\r\n\\` `` ` `` `&_*[]<!`) and the
+  default case copies everything up to the next special byte into one text node.
+  The scanner does not advance one character at a time through ordinary text.
+- Code spans: `handle_backticks` records the opening backtick run length, then
+  `scan_to_closing_backticks` searches forward for a run of identical length.
+  Content is normalized by `S_normalize_code` (newlines to spaces, strip one
+  leading and trailing space).
+- Emphasis: a **delimiter stack**. `scan_delims` computes left/right flanking
+  (`can_open`, `can_close`) from the surrounding character classes.
+  `push_delimiter` records each `*`/`_` run. After the run is scanned,
+  `process_emphasis` walks the stack backward pairing closers with openers,
+  honoring the length-mod-3 rule via an `openers_bottom` table, and
+  `S_insert_emph` builds the EMPH/STRONG nodes and splices children between the
+  matched delimiters.
+
+---
+
+## pulldown-cmark (Rust)
+
+### Block / inline split
+- Self-described as a "tree-based two pass parser." `run_first_pass` builds a
+  `Tree<Item>` of block structure. Each `Item` stores `start`/`end` byte offsets
+  into the original source rather than copied strings.
+- The second pass is lazy: during event iteration, when the cursor reaches an
+  item whose body `is_maybe_inline()`, it calls `handle_inline`, which scans that
+  item's byte range for inline constructs and rewrites the tree in place.
+- `ItemBody` is an enum that distinguishes block kinds (`Paragraph`, `Heading`,
+  `List`) from unresolved inline markers (`MaybeEmphasis`, `MaybeCode`,
+  `MaybeLinkOpen`) and resolved inlines (`Code`, `Link`, `Emphasis`). The first
+  pass emits the "Maybe" variants; the inline pass resolves them.
+
+### Inline scanner
+- Two stages. `handle_inline_pass1` resolves code spans, HTML, and links first
+  (constructs whose interior must not be reprocessed), walking sibling tree nodes
+  via `tree[cur].next`. `handle_emphasis_and_hard_break` then resolves emphasis.
+- Emphasis uses an `InlineStack` of `InlineEl { start, count, run_length, c,
+  both }`. On a closer, `find_match` searches the stack backward for a
+  compatible opener, applying the same `(count + el.count) % 3 != 0` rule cmark
+  uses, and nests matches from the inside out. Leftover delimiters become text.
+- Code spans use a `CodeDelims` map keyed by backtick-run length
+  (`HashMap<usize, VecDeque<TreeIndex>>`), so a closing run of a given length can
+  be found without rescanning. `make_code_span` extracts and normalizes the span
+  and interns it through an allocation pool (`allocate_cow`).
+- The notable structural idea is that nodes carry source spans, and the inline
+  pass mutates tree indices in place rather than allocating a fresh node list.
+
+---
+
+## goldmark (Go)
+
+### Block / inline split
+- Interface driven. `BlockParser` has `Open` / `Continue` / `Close`;
+  `InlineParser` has `Parse` and `Trigger`. `Parse()` runs `parseBlocks` to build
+  the block AST, then `walkBlock` does a post-order traversal calling
+  `parseBlock`, which performs inline parsing on each finished block.
+- Open blocks are tracked as `openedBlocks []Block` (each a node plus its
+  parser) on the parser `Context`. Per line, open blocks try to `Continue`; if
+  they decline, `openBlocks` tries to start new ones. This is what implements
+  lazy paragraph continuation.
+
+### Inline scanner
+- Dispatch is table driven. Goldmark keeps a `[256][]InlineParser` array indexed
+  by trigger byte. Each `InlineParser` registers the bytes it cares about via
+  `Trigger()`. During `parseBlock`, the current byte selects
+  `inlineParsers[parserChar]` and only those parsers run, instead of testing
+  every parser at every position.
+- Emphasis uses a doubly-linked delimiter list on the `Context`
+  (`delimiters`, `lastDelimiter`) with `PushDelimiter`, `RemoveDelimiter`,
+  `ClearDelimiters`. `ProcessDelimiters` runs at block end to pair openers and
+  closers and build emphasis nodes, again decoupled from the initial scan.
+
+---
+
+## Cross-cutting comparison
+
+| Concern | cmark | pulldown-cmark | goldmark | clang-doc (current) |
+|---|---|---|---|---|
+| Block vs inline | block tree, then `process_inlines` | first pass tree, lazy inline pass | `parseBlocks` then `walkBlock`/`parseBlock` | block loop per line, inline run per plain-text line inline in the loop |
+| Inline dispatch | switch on current byte | enum rewrite over tree | `[256][]InlineParser` trigger table | if-chain on `` ` `` `*` `_` |
+| Text runs | bulk scan to next special byte | span offsets on tree items | parser-driven segments | char-by-char with a `flushText` lambda |
+| Code spans | matching backtick run | length-keyed delimiter map | trigger parser | matching backtick run (already aligned) |
+| Emphasis | delimiter stack + flanking + mod-3 | `InlineStack` + `find_match` + mod-3 | delimiter linked list + `ProcessDelimiters` | recursive `findClosingDelim` forward search (simplified subset) |
+| Text storage | raw buffer, freed after inline | source byte spans | reader segments | copied/interned via `StringSaver` |
+
+The three production parsers converge on the same two ideas: a deferred inline
+pass over text the block phase collected, and a delimiter-stack emphasis
+algorithm with flanking rules and the length-mod-3 opener constraint.
+
+---
+
+## Relevance to our LLVM clang-doc parser
+
+Current state of `support/Markdown.cpp`:
+- `parseMarkdown` is the block phase. It splits the paragraph into lines, walks
+  them with the `LineReader` cursor, and recognizes fenced code, pipe tables, and
+  unordered lists, falling back to plain text otherwise.
+- `parseInline` is the inline phase: a character scanner over one line with a
+  `Pos` cursor and a `flushText` lambda, dispatching on `` ` ``, `*`, `_`.
+- Emphasis/strong closing is found by `findClosingDelim`, a recursive forward
+  search, documented as a simplified subset of the CommonMark flanking rules.
+  Code spans already use matching backtick-run length via `countRun`.
+
+What maps cleanly, in rough priority order:
+
+1. **Make the inline pass a real second pass over all text-bearing nodes.**
+   Today `parseInline` is called only on the plain-text fallback line, so table
+   cells and list item text never get inline parsing. cmark, pulldown, and
+   goldmark all run inline parsing as a distinct phase over every block that
+   contains text. The lightweight version for us: after the block loop builds the
+   node list, walk it once and run `parseInline` on each text-bearing node
+   (paragraph text, list item text, and eventually table cells). This removes the
+   "inline only happens for loose paragraph lines" gap and matches the TODO
+   already noted on the table rows loop. It does not require a full tree rewrite,
+   just a second traversal of what `parseMarkdown` already produces.
+
+2. **Adopt the delimiter-stack emphasis algorithm if we want conformance.**
+   This is the single biggest structural difference. Our `findClosingDelim` is a
+   forward search that cannot resolve the general overlap and nesting cases
+   (triple runs like `***x***`, runs of differing length, interleaved `*`/`_`).
+   All three references implement the same standard algorithm: scan a run,
+   compute `can_open`/`can_close` from flanking, push onto a stack, then walk the
+   stack backward pairing closers to openers with the length-mod-3 opener
+   constraint. If emphasis conformance becomes a goal, this is the known-correct
+   shape to port, and our current code already isolates the flanking check
+   (`isSpace` neighbor tests) that would feed `can_open`/`can_close`.
+
+3. **Trigger-style dispatch when the inline grammar grows.** Our if-chain on
+   `` ` `` `*` `_` is fine for three constructs. If we add links, autolinks, or
+   entities, goldmark's trigger table (dispatch keyed by the current byte) or
+   cmark's switch keeps the scanner flat instead of a growing if-ladder. Not
+   needed yet; worth keeping in mind so the scanner does not accrete nested ifs.
+
+4. **Bulk text-run scanning.** cmark's `subject_find_special_char` jumps to the
+   next special byte and emits one text node, rather than advancing one byte at a
+   time. Our scanner advances character by character in the default case. A
+   `find_first_of("`*_")` style jump would cut per-character work on long plain
+   runs. Minor for doc-comment-sized input, but a cheap, local improvement.
+
+5. **Source spans vs copying (consider, do not rush).** pulldown stores byte
+   offsets into the original text instead of copying strings; cmark frees raw
+   buffers after the inline pass. We copy via `StringSaver`. For comment-sized
+   input the copy cost is negligible, and our source text is assembled from
+   concatenated comment fragments that are not guaranteed contiguous or stable,
+   so spans are not a clean fit today. Note it only as a future memory option if
+   the parser ever runs over large contiguous inputs.
+
+### Suggested sequencing
+- Near term: item 1 (deferred inline pass over text-bearing nodes) is the
+  highest-value, lowest-risk change and directly closes the table-cell and
+  list-item inline gap.
+- Medium term: item 2 (delimiter stack) only if CommonMark emphasis conformance
+  is prioritized over the current pragmatic subset.
+- Opportunistic: items 3 and 4 as the inline grammar and input sizes grow.
+
+### One caveat
+The three references are full CommonMark implementations. Our parser
+deliberately targets a doc-comment subset (the existing TODOs say as much), so
+the goal is to borrow structure (the two-phase split, the delimiter-stack shape,
+trigger dispatch) rather than match feature for feature. The most defensible next
+step is structural: turn inline parsing into a real second pass so every
+text-bearing node is handled uniformly, which is the one design choice all three
+parsers share and the one our current code most clearly diverges from.
diff --git a/followed b/followed
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/prefix b/prefix
new file mode 100644
index 0000000000000..e69de29bb2d1d