[clang-tools-extra] [clang-doc] Add standalone Markdown parsing library (PR #202991)
Erick Velez via cfe-commits
cfe-commits at lists.llvm.org
Fri Jun 12 00:55:59 PDT 2026
================
@@ -0,0 +1,297 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "Markdown.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/DebugLog.h"
+#include <cassert>
+
+#define DEBUG_TYPE "clang-doc"
+
+using namespace llvm;
+
+namespace clang::doc::markdown {
+
+// Copies the elements of Vec into a contiguous array allocated from Arena and
+// returns an ArrayRef over that copy. An empty Vec yields an empty ArrayRef
+// and requests nothing from the arena.
+//
+// Vec is only read, so it is taken by const reference.
+template <typename T>
+static ArrayRef<T> allocateArray(const SmallVectorImpl<T> &Vec,
+                                 BumpPtrAllocator &Arena) {
+  if (Vec.empty())
+    return {};
+  // ArrayRef::copy performs the allocate-then-copy-construct sequence itself,
+  // so the allocation does not need to be hand-rolled here.
+  return ArrayRef<T>(Vec).copy(Arena);
+}
+
+// Interns a StringRef into the arena so it outlives the parse loop. Returns a
+// StringRef over the arena-owned copy of S, or an empty StringRef when S is
+// empty (no allocation is requested in that case).
+static StringRef internString(StringRef S, BumpPtrAllocator &Arena) {
+  if (S.empty())
+    return {};
+  // StringRef::copy allocates S.size() bytes from the allocator and copies the
+  // characters into them, replacing the hand-written Allocate + std::copy.
+  return S.copy(Arena);
+}
+
+// Recognizes a table separator row: the line may contain nothing but '|',
+// '-', ':' and spaces, and must include at least one '-'.
+static bool isSepRow(StringRef Line) {
+  // Any character outside the separator alphabet disqualifies the line.
+  if (Line.find_first_not_of("|-: ") != StringRef::npos)
+    return false;
+  // Pipes, colons and spaces alone are not enough; a dash is required.
+  return Line.contains('-');
+}
+
+// Reports whether Line opens with a bullet list marker: one of '-', '*' or
+// '+' immediately followed by a space.
+static bool isListItem(StringRef Line) {
+  for (StringRef Marker : {"- ", "* ", "+ "}) {
+    if (Line.starts_with(Marker))
+      return true;
+  }
+  return false;
+}
+
+namespace {
+
+// A forward cursor over the lines of a paragraph. Encapsulates the parse
+// position so the loop can inspect the current or an upcoming line and consume
+// lines without manual index arithmetic. Lines are stored untrimmed; callers
+// trim where they need a normalized view.
+//
+// The class is local to this file, so it lives in an anonymous namespace to
+// give it internal linkage like the static helper functions around it.
+class LineReader {
+public:
+  explicit LineReader(ArrayRef<StringRef> Lines) : Lines(Lines) {}
+
+  // True once every line has been consumed.
+  bool atEnd() const { return Pos >= Lines.size(); }
+
+  // The current line, untrimmed. Must not be called when atEnd().
+  StringRef peek() const {
+    assert(!atEnd() && "peek past end of input");
+    return Lines[Pos];
+  }
+
+  // The line Offset positions ahead of the cursor, or an empty StringRef when
+  // that position is past the end. peek(0) is the current line.
+  StringRef peek(size_t Offset) const {
+    // Compare Offset against the number of remaining lines instead of forming
+    // Pos + Offset, which could wrap around for a very large Offset.
+    if (atEnd() || Offset >= Lines.size() - Pos)
+      return StringRef();
+    return Lines[Pos + Offset];
+  }
+
+  // Consume the current line and return it, untrimmed. Must not be called when
+  // atEnd().
+  StringRef advance() {
+    assert(!atEnd() && "advance past end of input");
+    return Lines[Pos++];
+  }
+
+private:
+  ArrayRef<StringRef> Lines; // View of the lines being read.
+  size_t Pos = 0;            // Index of the next line to be consumed.
+};
+
+} // namespace
+
+// Length of the unbroken run of C in S that begins at index Start. The result
+// is zero when S[Start] is not C or when Start is at or beyond the end of S.
+static size_t countRun(StringRef S, size_t Start, char C) {
+  // substr clamps Start to the string length, so an out-of-range Start simply
+  // produces an empty tail and therefore a run of zero.
+  StringRef Tail = S.substr(Start);
+  return Tail.take_while([C](char Ch) { return Ch == C; }).size();
+}
+
+// Normalizes the content of a code span per CommonMark §6.1: when the content
+// both starts and ends with a space and is not made up solely of spaces, one
+// space is removed from each end. Otherwise the content is returned as is.
+static StringRef trimCodeSpan(StringRef Code) {
+  // Too short to carry a padding space on both sides.
+  if (Code.size() < 2)
+    return Code;
+  // Padding is only stripped when it is present on both sides.
+  if (Code.front() != ' ' || Code.back() != ' ')
+    return Code;
+  // Content consisting only of spaces is preserved verbatim.
+  if (Code.find_first_not_of(' ') == StringRef::npos)
+    return Code;
+  return Code.substr(1, Code.size() - 2);
+}
+
+// Locates the closing emphasis delimiter for an opener whose content starts at
+// index From. A closer is a run of exactly Count copies of C. As a simplified
+// form of the CommonMark §6.2 flanking rules, the character right after the
+// opener (S[From]) and the character right before the closer must both be
+// non-whitespace, and the enclosed content must be non-empty. Returns the
+// index where the closing run starts, or StringRef::npos if there is none.
+static size_t findClosingDelim(StringRef S, size_t From, char C, size_t Count) {
+  const size_t Len = S.size();
+  // Whitespace (or nothing at all) right after the opener means the opener is
+  // not left-flanking, so it cannot start emphasis.
+  if (From >= Len || isSpace(S[From]))
+    return StringRef::npos;
+
+  size_t Pos = From;
+  // Stop as soon as fewer than Count characters remain.
+  while (Pos + Count <= Len) {
+    if (S[Pos] != C) {
+      ++Pos;
+      continue;
+    }
+    size_t RunLen = countRun(S, Pos, C);
+    if (RunLen != Count) {
+      // A run of the wrong length can never close; step over all of it.
+      Pos += RunLen;
+      continue;
+    }
+    // Accept only when there is content before the run and that content does
+    // not end in whitespace (the closer must be right-flanking).
+    bool HasContent = Pos != From;
+    if (HasContent && !isSpace(S[Pos - 1]))
+      return Pos;
+    ++Pos;
+  }
+  return StringRef::npos;
+}
+
+// Parses the inline content of a single line into a sequence of inline nodes:
+// inline code (`code`), strong (**text** or __text__), and emphasis (*text* or
+// _text_). Runs that match no construct become TextNodes. Emphasis and strong
+// recurse so their content may itself contain inline constructs. Text with no
+// markers yields a single TextNode.
+//
+// TODO: This covers the common cases but not the full CommonMark §6 inline
+// model (delimiter stacks, intraword underscore rules, links, autolinks).
+static ArrayRef<MDNode *> parseInline(StringRef S, BumpPtrAllocator &Arena) {
+ SmallVector<MDNode *> Nodes;
+ size_t TextStart = 0, I = 0, E = S.size();
+
+ auto flushText = [&](size_t End) {
+ if (End > TextStart)
+ Nodes.push_back(new (Arena) TextNode(
+ internString(S.substr(TextStart, End - TextStart), Arena)));
+ };
+
+ while (I < E) {
+ char C = S[I];
+
+ // Inline code span: a run of N backticks closed by a run of N backticks.
+ if (C == '`') {
+ size_t N = countRun(S, I, '`');
+ size_t J = I + N;
+ while (J < E && countRun(S, J, '`') != N)
+ J += S[J] == '`' ? countRun(S, J, '`') : 1;
+ if (J < E) {
+ flushText(I);
+ StringRef Code = trimCodeSpan(S.substr(I + N, J - (I + N)));
+ Nodes.push_back(new (Arena) InlineCodeNode(internString(Code, Arena)));
+ I = J + N;
+ TextStart = I;
+ continue;
+ }
+ // No closing run; leave the backticks as literal text.
+ I += N;
+ continue;
+ }
----------------
evelez7 wrote:
I think these variable names could be cleaned up. I understand that they are all just lengths and spans across the text, but a condition like `J < E` is pretty hard to figure out even if you have some context.
https://github.com/llvm/llvm-project/pull/202991
More information about the cfe-commits
mailing list