[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Erick Velez via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 12 00:58:52 PDT 2026
================
@@ -0,0 +1,274 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Standalone Markdown parsing library for the LLVM ecosystem.
+///
+/// The parser takes plain paragraph text and returns a polymorphic tree of
+/// MDNode-derived objects allocated in a caller-supplied BumpPtrAllocator.
+/// Node types form a closed class hierarchy rooted at MDNode. Each concrete
+/// type carries exactly the fields it needs -- no overloaded Content field,
+/// no unused arrays. Use llvm::isa<>/cast<>/dyn_cast<> for type-safe
+/// downcasting; each concrete type provides classof() for this purpose.
+///
+/// See
+/// https://llvm.org/docs/ProgrammerManual.html#the-isa-cast-and-dyn-cast-templates
+///
+/// Field ordering in each derived struct is chosen to minimize padding:
+/// 4-byte fields (like Level or Start) are declared before 16-byte fields
+/// (ArrayRef, StringRef) so that no implicit padding is inserted between the
+/// base class's 4-byte Kind and the first derived field.
+///
+/// Inline nodes (appear inside ParagraphNode, HeadingNode, etc.):
+/// TextNode -- plain text run
+/// SoftBreakNode -- soft line break
+/// HardBreakNode -- hard line break (trailing spaces or backslash)
+/// InlineCodeNode -- inline code span (`code`)
+/// EmphasisNode -- emphasis (*text* or _text_)
+/// StrongNode -- strong emphasis (**text** or __text__)
+///
+/// Block nodes:
+/// ParagraphNode -- sequence of inline nodes
+/// HeadingNode -- ATX heading (# through ######), level 1-6
+/// FencedCodeNode -- fenced code block (``` or ~~~)
+/// TableNode -- pipe table (raw row text; TODO: structured cells)
+/// UnorderedListNode -- bullet list (-, *, +)
+/// OrderedListNode -- numbered list with explicit start number
+/// ListItemNode -- single item inside a list
+/// BlockQuoteNode -- block quote (>)
+/// ThematicBreakNode -- horizontal rule (---, ***, ___)
+///
+/// All nodes are arena-allocated. The caller owns the arena and must keep it
+/// alive for the lifetime of any returned nodes. The parser never crashes on
+/// malformed input; unrecognized text falls back to TextNode.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_SUPPORT_MARKDOWN_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+
+namespace clang::doc::markdown {
+
+/// Discriminator for all Markdown AST nodes. Inline kinds are grouped before
+/// block kinds; the sentinels NK_LastInline and NK_FirstBlock alias real kinds
+/// (not extra values) to enable cheap range-based checks in classof().
+enum class NodeKind {
+ // Inline nodes
+ NK_Text, // TextNode: plain text run
+ NK_SoftBreak, // SoftBreakNode: soft line break
+ NK_HardBreak, // HardBreakNode: hard line break
+ NK_InlineCode, // InlineCodeNode: inline code span
+ NK_Emphasis, // EmphasisNode: *text* or _text_
+ NK_Strong, // StrongNode: **text** or __text__
+ NK_LastInline = NK_Strong, // sentinel -- all inline kinds are <= this
+
+ // Block nodes
+ NK_Paragraph, // ParagraphNode: sequence of inline nodes
+ NK_Heading, // HeadingNode: ATX heading, level 1-6
+ NK_FencedCode, // FencedCodeNode: fenced code block (``` or ~~~)
+ NK_Table, // TableNode: pipe table
+ NK_UnorderedList, // UnorderedListNode: bullet list (-, *, +)
+ NK_OrderedList, // OrderedListNode: numbered list
+ NK_ListItem, // ListItemNode: single item inside a list
+ NK_BlockQuote, // BlockQuoteNode: block quote (>)
+ NK_ThematicBreak, // ThematicBreakNode: horizontal rule (---, ***, ___)
+ NK_FirstBlock = NK_Paragraph, // sentinel -- all block kinds are >= this
+};
+
+/// Base type for all Markdown AST nodes. Carries only the kind discriminator.
+/// Nodes are arena-allocated and have no virtual destructor; use
+/// llvm::isa<>/cast<>/dyn_cast<> for type-safe downcasting.
+struct MDNode {
+ NodeKind Kind; ///< Which concrete node type this object is.
+ explicit MDNode(NodeKind K) : Kind(K) {} // records K as this node's kind
+};
+
+//===----------------------------------------------------------------------===//
+// Inline nodes
+//===----------------------------------------------------------------------===//
+
+/// Plain text run. classof() lets llvm::isa<>/cast<>/dyn_cast<> recognize it.
+struct TextNode : MDNode {
+ llvm::StringRef Text; ///< The run's characters (StringRef: non-owning view).
+ explicit TextNode(llvm::StringRef Text) // tags the node NK_Text, stores Text
+ : MDNode(NodeKind::NK_Text), Text(Text) {}
+ static bool classof(const MDNode *N) { return N->Kind == NodeKind::NK_Text; }
+};
+
+/// Soft line break -- a newline that does not end the paragraph.
+struct SoftBreakNode : MDNode {
+ SoftBreakNode() : MDNode(NodeKind::NK_SoftBreak) {} // carries no payload
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_SoftBreak;
+ }
+};
+
+/// Hard line break -- two trailing spaces or a backslash before a newline.
+struct HardBreakNode : MDNode {
+ HardBreakNode() : MDNode(NodeKind::NK_HardBreak) {} // carries no payload
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_HardBreak;
+ }
+};
+
+/// Inline code span: `code`. Code does not include the surrounding backticks.
+struct InlineCodeNode : MDNode {
+ llvm::StringRef Code; ///< Span contents (StringRef: non-owning view).
+ explicit InlineCodeNode(llvm::StringRef Code) // tags NK_InlineCode, stores Code
+ : MDNode(NodeKind::NK_InlineCode), Code(Code) {}
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_InlineCode;
+ }
+};
+
+/// Emphasized text: *text* or _text_. The emphasized content is in Children.
+struct EmphasisNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children; ///< Nodes forming the emphasized content.
+ explicit EmphasisNode(llvm::ArrayRef<MDNode *> Children) // tags NK_Emphasis
+ : MDNode(NodeKind::NK_Emphasis), Children(Children) {}
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_Emphasis;
+ }
+};
+
+/// Strongly emphasized text: **text** or __text__. Content is in Children.
+struct StrongNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children; ///< Nodes forming the strong content.
+ explicit StrongNode(llvm::ArrayRef<MDNode *> Children) // tags NK_Strong
+ : MDNode(NodeKind::NK_Strong), Children(Children) {}
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_Strong;
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Block nodes
+//===----------------------------------------------------------------------===//
+
+/// A paragraph -- sequence of inline nodes separated from other blocks by
+/// blank lines. The inline nodes are held, in order, in Children.
+struct ParagraphNode : MDNode {
+ llvm::ArrayRef<MDNode *> Children; ///< The paragraph's inline nodes.
+ explicit ParagraphNode(llvm::ArrayRef<MDNode *> Children) // tags NK_Paragraph
+ : MDNode(NodeKind::NK_Paragraph), Children(Children) {}
+ static bool classof(const MDNode *N) { // hook for llvm::isa<>/cast<>/dyn_cast<>
+ return N->Kind == NodeKind::NK_Paragraph;
+ }
+};
+
+/// ATX heading: one to six leading # characters. Level is declared before
+/// Children to avoid padding between the base class's 4-byte Kind and the
+/// 8-byte-aligned ArrayRef, keeping sizeof(HeadingNode) at 24 bytes.
----------------
evelez7 wrote:
I don't think you need to specify padding or alignment here.
https://github.com/llvm/llvm-project/pull/202991
More information about the cfe-commits mailing list