[clang-tools-extra] [clang-doc] add Markdown parser (PR #155887)
Paul Kirth via cfe-commits
cfe-commits at lists.llvm.org
Fri Aug 29 13:55:08 PDT 2025
================
@@ -0,0 +1,99 @@
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANG_DOC_MD_PARSER_H
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/StringSaver.h"
+#include <list>
+
+using namespace llvm;
+
+namespace clang {
+namespace doc {
+using llvm::SmallString;
+enum class MDState { Emphasis, Strong, None }; // NOTE(review): presumably the parser's current inline state — confirm.
+
+enum class MDType { // Kind tag stored in Node::Type.
+ Paragraph,
+ Emphasis, // NOTE(review): presumably CommonMark emphasis — confirm.
+ Strong, // NOTE(review): presumably CommonMark strong emphasis — confirm.
+ Text,
+ Softbreak,
+};
+
+enum class MDTokenType { LeftDelimiterRun, RightDelimiterRun, Text }; // Token categories: left/right delimiter run or text.
+
+struct Node { // Tree node with a kind tag, a parent link, children, and text.
+ SmallVector<Node*> Children; // Child nodes, held as raw pointers.
+ MDType Type; // Kind of this node; no default initializer.
+ Node *Parent; // Parent link; no default initializer, so it must be set explicitly.
+ std::string Content; // Text owned by this node. NOTE(review): presumably empty for non-text kinds — confirm.
+};
+
+struct DelimiterContext { // Classification of one delimiter run.
+ bool RightFlanking; // Run is right-flanking.
+ bool LeftFlanking; // Run is left-flanking.
+ bool CanOpen; // Run may act as an opener.
+ bool CanClose; // Run may act as a closer.
+ char DelimChar; // NOTE(review): presumably '*' or '_' — confirm.
+ // LineNode::Content is a StringRef, so the run length is tracked separately
+ // here and can be decremented when necessary without modifying the string.
+ size_t Length;
+};
+
+/// @brief A preprocessing structure for tracking text in a line.
+/// A LineNode might be a valid delimiter run, text, or a delimiter run that
+/// will later be merged with text if there is no matching run, e.g. ***foo.
+struct LineNode {
+ StringRef Content;
+ // Engaged only for a delimiter run. NOTE(review): the member reuses its type's name; GCC may reject this ("changes meaning") — verify.
+ std::optional<DelimiterContext> DelimiterContext;
+};
+
+class MarkdownParser {
+ // MDState State;
+ BumpPtrAllocator Arena;
+ StringSaver Saver;
+
+ /// If a delimiter is found, determine if it is a delimiter run, what type of
+ /// run it is, and whether it can be an opener or closer.
+ ///
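+ /// The CommonMark specification defines delimiter runs as:
+ /// A delimiter run is either a sequence of one or more * characters that is
+ /// not preceded or followed by a non-backslash-escaped * character, or a
+ /// sequence of one or more _ characters that is not preceded or followed by
+ /// a non-backslash-escaped _ character.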
+ ///
+ /// A left-flanking delimiter run is a delimiter run that is (1) not followed
+ /// by Unicode whitespace, and either (2a) not followed by a Unicode
+ /// punctuation character, or (2b) followed by a Unicode punctuation character
+ /// and preceded by Unicode whitespace or a Unicode punctuation character.
+ ///
+ /// A right-flanking delimiter run is a delimiter run that is (1) not preceded
+ /// by Unicode whitespace, and either (2a) not preceded by a Unicode
+ /// punctuation character, or (2b) preceded by a Unicode punctuation character
+ /// and followed by Unicode whitespace or a Unicode punctuation character.
+ ///
+ /// @param IdxOrigin the index of * or _ that might start a delimiter run.
+ /// @return A pair denoting the type of run and the index where the run stops
+ std::pair<std::optional<DelimiterContext>, size_t>
+ processDelimiters(SmallString<64> &Line, const size_t &Origin = 0);
----------------
ilovepi wrote:
```suggestion
/// @param Origin the index of * or _ that might start a delimiter run.
/// @return A pair denoting the type of run and the index where the run stops
std::pair<std::optional<DelimiterContext>, size_t>
processDelimiters(SmallString<64> &Line, const size_t &Origin = 0);
```
The parameter name in the doc comment seems stale: it says `IdxOrigin`, but the parameter is `Origin`. I do wonder if you should just use `Start` or `StartIdx`, though.
https://github.com/llvm/llvm-project/pull/155887
More information about the cfe-commits
mailing list