[clang-tools-extra] [llvm] [llvm] add support for mustache templating language (PR #105893)
Paul Kirth via cfe-commits
cfe-commits at lists.llvm.org
Fri Nov 15 13:14:53 PST 2024
================
@@ -0,0 +1,724 @@
+//===-- Mustache.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace llvm::json;
+
+namespace llvm {
+namespace mustache {
+
+class Token {
+public:
+ enum class Type {
+ Text,
+ Variable,
+ Partial,
+ SectionOpen,
+ SectionClose,
+ InvertSectionOpen,
+ UnescapeVariable,
+ Comment,
+ };
+
+ Token(StringRef Str);
+
+ Token(StringRef RawBody, StringRef Str, char Identifier);
+
+ StringRef getTokenBody() const { return TokenBody; };
+
+ StringRef getRawBody() const { return RawBody; };
+
+ void setTokenBody(StringRef NewBody) { TokenBody = NewBody.str(); };
+
+ Accessor getAccessor() const { return Accessor; };
+
+ Type getType() const { return TokenType; };
+
+ void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
+
+ size_t getIndentation() const { return Indentation; };
+
+ static Type getTokenType(char Identifier);
+
+private:
+ size_t Indentation;
+ Type TokenType;
+ // RawBody is the original string that was tokenized
+ std::string RawBody;
+ Accessor Accessor;
+ // TokenBody is the original string with the identifier removed
+ std::string TokenBody;
+};
+
+class ASTNode {
+public:
+ enum Type {
+ Root,
+ Text,
+ Partial,
+ Variable,
+ UnescapeVariable,
+ Section,
+ InvertSection,
+ };
+
+ ASTNode() : T(Type::Root), ParentContext(nullptr) {};
+
+ ASTNode(StringRef Body, ASTNode *Parent)
+ : T(Type::Text), Body(Body), Parent(Parent), ParentContext(nullptr) {};
+
+ // Constructor for Section/InvertSection/Variable/UnescapeVariable
+ ASTNode(Type T, Accessor Accessor, ASTNode *Parent)
+ : T(T), Parent(Parent), Children({}), Accessor(Accessor),
+ ParentContext(nullptr) {};
+
+ void addChild(ASTNode *Child) { Children.emplace_back(Child); };
+
+ void setBody(StringRef NewBody) { Body = NewBody; };
+
+ void setRawBody(StringRef NewBody) { RawBody = NewBody; };
+
+ void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
+
+ void render(const llvm::json::Value &Data, llvm::raw_ostream &OS);
+
+ void setUpNode(llvm::BumpPtrAllocator &Alloc, StringMap<ASTNode *> &Partials,
+ StringMap<Lambda> &Lambdas,
+ StringMap<SectionLambda> &SectionLambdas,
+ DenseMap<char, StringRef> &Escapes);
+
+private:
+ void renderLambdas(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+ Lambda &L);
+
+ void renderSectionLambdas(const llvm::json::Value &Contexts,
+ llvm::raw_ostream &OS, SectionLambda &L);
+
+ void renderPartial(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+ ASTNode *Partial);
+
+ void renderChild(const llvm::json::Value &Context, llvm::raw_ostream &OS);
+
+ const llvm::json::Value *findContext();
+
+ llvm::BumpPtrAllocator *Allocator;
+ StringMap<ASTNode *> *Partials;
+ StringMap<Lambda> *Lambdas;
+ StringMap<SectionLambda> *SectionLambdas;
+ DenseMap<char, StringRef> *Escapes;
+ Type T;
+ size_t Indentation = 0;
+ std::string RawBody;
+ std::string Body;
+ ASTNode *Parent;
+ // TODO: switch implementation to SmallVector<T>
+ std::vector<ASTNode *> Children;
+ const Accessor Accessor;
+ const llvm::json::Value *ParentContext;
+};
+
+// Custom stream to escape strings
+class EscapeStringStream : public raw_ostream {
+public:
+ explicit EscapeStringStream(llvm::raw_ostream &WrappedStream,
+ DenseMap<char, StringRef> &Escape)
+ : Escape(Escape), WrappedStream(WrappedStream) {
+ SetUnbuffered();
+ }
+
+protected:
+ void write_impl(const char *Ptr, size_t Size) override {
+ llvm::StringRef Data(Ptr, Size);
+ for (char C : Data) {
+ auto It = Escape.find(C);
+ if (It != Escape.end())
+ WrappedStream << It->getSecond();
+ else
+ WrappedStream << C;
+ }
+ }
+
+ uint64_t current_pos() const override { return WrappedStream.tell(); }
+
+private:
+ DenseMap<char, StringRef> &Escape;
+ llvm::raw_ostream &WrappedStream;
+};
+
+// Custom stream to add indentation used to for rendering partials
+class AddIndentationStringStream : public raw_ostream {
+public:
+ explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream,
+ size_t Indentation)
+ : Indentation(Indentation), WrappedStream(WrappedStream) {
+ SetUnbuffered();
+ }
+
+protected:
+ void write_impl(const char *Ptr, size_t Size) override {
+ llvm::StringRef Data(Ptr, Size);
+ std::string Indent(Indentation, ' ');
+ for (char C : Data) {
+ WrappedStream << C;
+ if (C == '\n')
+ WrappedStream << Indent;
+ }
+ }
+
+ uint64_t current_pos() const override { return WrappedStream.tell(); }
+
+private:
+ size_t Indentation;
+ llvm::raw_ostream &WrappedStream;
+};
+
+Accessor split(StringRef Str, char Delimiter) {
+ Accessor Tokens;
+ if (Str == ".") {
+ Tokens.emplace_back(Str);
+ return Tokens;
+ }
+ StringRef Ref(Str);
+ while (!Ref.empty()) {
+ StringRef Part;
+ std::tie(Part, Ref) = Ref.split(Delimiter);
+ Tokens.emplace_back(Part.trim());
+ }
+ return Tokens;
+}
+
+Token::Token(StringRef RawBody, StringRef InnerBody, char Identifier)
+ : RawBody(RawBody), TokenBody(InnerBody), Indentation(0) {
+ TokenType = getTokenType(Identifier);
+ if (TokenType == Type::Comment)
+ return;
+
+ StringRef AccessorStr =
+ TokenType == Type::Variable ? InnerBody : InnerBody.substr(1);
+
+ Accessor = split(AccessorStr.trim(), '.');
+}
+
+Token::Token(StringRef Str)
+ : TokenType(Type::Text), RawBody(Str), Accessor({}), TokenBody(Str),
+ Indentation(0) {}
+
+Token::Type Token::getTokenType(char Identifier) {
+ switch (Identifier) {
+ case '#':
+ return Type::SectionOpen;
+ case '/':
+ return Type::SectionClose;
+ case '^':
+ return Type::InvertSectionOpen;
+ case '!':
+ return Type::Comment;
+ case '>':
+ return Type::Partial;
+ case '&':
+ return Type::UnescapeVariable;
+ default:
+ return Type::Variable;
+ }
+}
+
+// Function to check if there is meaningful text behind.
+// We determine if a token has meaningful text behind
+// if the right of previous token contains anything that is
+// not a newline
+// For example: "Stuff {{#Section}}"
+// We make an exception for when previous token is empty
+// and the current token is the second token
+// For example: "{{#Section}}"
+bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) {
+ if (Idx == 0)
+ return true;
+
+ int PrevIdx = Idx - 1;
+ if (Tokens[PrevIdx].getType() != Token::Type::Text)
+ return true;
+
+ const Token &PrevToken = Tokens[Idx - 1];
+ StringRef TokenBody = PrevToken.getRawBody().rtrim(" \t\v");
+ return !TokenBody.ends_with("\n") && !(TokenBody.empty() && Idx == 1);
+}
+
+// Function to check if there's no meaningful text ahead
+// We determine if a token has no meaningful text behind
+// if the left of previous token is empty spaces or tabs followed
+// by a newline
+// For example: "{{#Section}} \n"
+bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) {
+ if (Idx >= Tokens.size() - 1)
+ return true;
+
+ int NextIdx = Idx + 1;
+ if (Tokens[NextIdx].getType() != Token::Type::Text)
+ return true;
+
+ const Token &NextToken = Tokens[Idx + 1];
+ StringRef TokenBody = NextToken.getRawBody().ltrim(" ");
+ return !TokenBody.starts_with("\r\n") && !TokenBody.starts_with("\n");
+}
+
+bool requiresCleanUp(Token::Type T) {
+ // We must clean up all the tokens that could contain child nodes
+ return T == Token::Type::SectionOpen || T == Token::Type::InvertSectionOpen ||
+ T == Token::Type::SectionClose || T == Token::Type::Comment ||
+ T == Token::Type::Partial;
+}
+
+// Adjust next token body if there is no text ahead
+// For example:
+// The template string
+// "{{! Comment }} \nLine 2"
+// would be considered as no text ahead and should be render as
+// " Line 2"
+void stripTokenAhead(SmallVector<Token> &Tokens, size_t Idx) {
+ Token &NextToken = Tokens[Idx + 1];
+ StringRef NextTokenBody = NextToken.getTokenBody();
+ // cut off the leading newline which could be \n or \r\n
+ if (NextTokenBody.starts_with("\r\n"))
+ NextToken.setTokenBody(NextTokenBody.substr(2));
+ else if (NextTokenBody.starts_with("\n"))
+ NextToken.setTokenBody(NextTokenBody.substr(1));
+}
+
+// Adjust previous token body if there no text behind
+// For example:
+// The template string
+// " \t{{#section}}A{{/section}}"
+// would be considered as having no text ahead and would be render as
+// "A"
+// The exception for this is partial tag which requires us to
+// keep track of the indentation once it's rendered.
+void stripTokenBefore(SmallVector<Token> &Tokens, size_t Idx,
+ Token &CurrentToken, Token::Type CurrentType) {
+ Token &PrevToken = Tokens[Idx - 1];
+ StringRef PrevTokenBody = PrevToken.getTokenBody();
+ StringRef Unindented = PrevTokenBody.rtrim(" \t\v");
+ size_t Indentation = PrevTokenBody.size() - Unindented.size();
+ if (CurrentType != Token::Type::Partial)
+ PrevToken.setTokenBody(Unindented);
+ CurrentToken.setIndentation(Indentation);
+}
+
+// Simple tokenizer that splits the template into tokens.
+// The mustache spec allows {{{ }}} to unescape variables
+// but we don't support that here. An unescape variable
+// is represented only by {{& variable}}.
+SmallVector<Token> tokenize(StringRef Template) {
+ SmallVector<Token> Tokens;
+ std::string Open("{{");
+ std::string Close("}}");
+ size_t Start = 0;
+ size_t DelimiterStart = Template.find(Open);
+ if (DelimiterStart == StringRef::npos) {
+ Tokens.emplace_back(Template);
+ return Tokens;
+ }
+ while (DelimiterStart != StringRef::npos) {
+ if (DelimiterStart != Start) {
+ Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start));
+ }
+
+ size_t DelimiterEnd = Template.find(Close, DelimiterStart);
+ if (DelimiterEnd == StringRef::npos) {
+ break;
+ }
+
+ // Extract the Interpolated variable without delimiters {{ and }}
+ size_t InterpolatedStart = DelimiterStart + Open.size();
+ size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size();
+ std::string Interpolated =
+ Template.substr(InterpolatedStart, InterpolatedEnd).str();
+ std::string RawBody = Open + Interpolated + Close;
+ Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]);
+ Start = DelimiterEnd + Close.size();
+ DelimiterStart = Template.find(Open, Start);
+ }
+
+ if (Start < Template.size())
+ Tokens.emplace_back(Template.substr(Start));
+
+ // Fix up white spaces for:
+ // - open sections
+ // - inverted sections
+ // - close sections
+ // - comments
+ //
+ // This loop attempts to find standalone tokens and tries to trim out
+ // the surrounding whitespace.
+ // For example:
+ // if you have the template string
+ // {{#section}} \n Example \n{{/section}}
+ // The output should would be
+ // For example
+ // \n Example \n
+ size_t LastIdx = Tokens.size() - 1;
+ for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
+ Token &CurrentToken = Tokens[Idx];
+ Token::Type CurrentType = CurrentToken.getType();
+ // Check if token type requires cleanup
+ bool RequiresCleanUp = requiresCleanUp(CurrentType);
+
+ if (!RequiresCleanUp)
+ continue;
+
+ // We adjust the token body if there's no text behind or ahead.
+ // A token is considered to have no text ahead if the right of the previous
+ // token is a newline followed by spaces.
+ // A token is considered to have no text behind if the left of the next
+ // token is spaces followed by a newline.
+ // eg.
+ // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
+ bool HasTextBehind = hasTextBehind(Idx, Tokens);
+ bool HasTextAhead = hasTextAhead(Idx, Tokens);
+
+ if ((!HasTextAhead && !HasTextBehind) || (!HasTextAhead && Idx == 0))
+ stripTokenAhead(Tokens, Idx);
+
+ if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx))
+ stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType);
+ }
+ return Tokens;
+}
+
+class Parser {
+public:
+ Parser(StringRef TemplateStr, BumpPtrAllocator &Allocator)
+ : Allocator(Allocator), TemplateStr(TemplateStr) {}
+
+ ASTNode *parse();
+
+private:
+ void parseMustache(ASTNode *Parent);
+
+ BumpPtrAllocator &Allocator;
+ SmallVector<Token> Tokens;
+ size_t CurrentPtr;
+ StringRef TemplateStr;
+};
+
+ASTNode *Parser::parse() {
+ Tokens = tokenize(TemplateStr);
+ CurrentPtr = 0;
+ void *Root = Allocator.Allocate(sizeof(ASTNode), alignof(ASTNode));
+ ASTNode *RootNode = new (Root) ASTNode();
+ parseMustache(RootNode);
+ return RootNode;
+}
+
+void Parser::parseMustache(ASTNode *Parent) {
+
+ while (CurrentPtr < Tokens.size()) {
+ Token CurrentToken = Tokens[CurrentPtr];
+ CurrentPtr++;
+ Accessor A = CurrentToken.getAccessor();
+ ASTNode *CurrentNode;
+ void *Node = Allocator.Allocate(sizeof(ASTNode), alignof(ASTNode));
+
+ switch (CurrentToken.getType()) {
+ case Token::Type::Text: {
+ CurrentNode = new (Node) ASTNode(CurrentToken.getTokenBody(), Parent);
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::Variable: {
+ CurrentNode = new (Node) ASTNode(ASTNode::Variable, A, Parent);
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::UnescapeVariable: {
+ CurrentNode = new (Node) ASTNode(ASTNode::UnescapeVariable, A, Parent);
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::Partial: {
+ CurrentNode = new (Node) ASTNode(ASTNode::Partial, A, Parent);
+ CurrentNode->setIndentation(CurrentToken.getIndentation());
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::SectionOpen: {
+ CurrentNode = new (Node) ASTNode(ASTNode::Section, A, Parent);
+ size_t Start = CurrentPtr;
+ parseMustache(CurrentNode);
+ size_t End = CurrentPtr;
+ std::string RawBody;
+ for (std::size_t I = Start; I < End - 1; I++)
+ RawBody += Tokens[I].getRawBody();
+ CurrentNode->setRawBody(RawBody);
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::InvertSectionOpen: {
+ CurrentNode = new (Node) ASTNode(ASTNode::InvertSection, A, Parent);
+ size_t Start = CurrentPtr;
+ parseMustache(CurrentNode);
+ size_t End = CurrentPtr;
+ std::string RawBody;
+ for (size_t Idx = Start; Idx < End - 1; Idx++)
+ RawBody += Tokens[Idx].getRawBody();
+ CurrentNode->setRawBody(RawBody);
+ Parent->addChild(CurrentNode);
+ break;
+ }
+ case Token::Type::Comment:
+ break;
+ case Token::Type::SectionClose:
+ return;
+ }
+ }
+}
+
+void Template::render(Value &Data, llvm::raw_ostream &OS) {
+ Tree->render(Data, OS);
+ LocalAllocator.Reset();
+}
+
+void Template::registerPartial(StringRef Name, StringRef Partial) {
+ Parser P = Parser(Partial, Allocator);
+ ASTNode *PartialTree = P.parse();
+ PartialTree->setUpNode(LocalAllocator, Partials, Lambdas, SectionLambdas,
+ Escapes);
+ Partials.insert(std::make_pair(Name, PartialTree));
+}
+
+void Template::registerLambda(StringRef Name, Lambda L) { Lambdas[Name] = L; }
+
+void Template::registerLambda(StringRef Name, SectionLambda L) {
+ SectionLambdas[Name] = L;
+}
+
+void Template::registerEscape(DenseMap<char, StringRef> E) { Escapes = E; }
+
+Template::Template(StringRef TemplateStr) {
+ Parser P = Parser(TemplateStr, Allocator);
+ Tree = P.parse();
+ // the default behaviour is to escape html entities
+ DenseMap<char, StringRef> HtmlEntities = {{'&', "&"},
+ {'<', "<"},
+ {'>', ">"},
+ {'"', """},
+ {'\'', "'"}};
+ registerEscape(HtmlEntities);
+ Tree->setUpNode(LocalAllocator, Partials, Lambdas, SectionLambdas, Escapes);
+}
+
+void toJsonString(const Value &Data, raw_ostream &OS) {
+ switch (Data.kind()) {
+ case Value::Null:
+ return;
+ case Value::Number: {
+ auto Num = *Data.getAsNumber();
+ std::ostringstream Oss;
+ Oss << Num;
+ OS << Oss.str();
+ return;
+ }
+ case Value::String: {
+ auto Str = *Data.getAsString();
+ OS << Str.str();
+ return;
+ }
+ case Value::Array: {
+ auto Arr = *Data.getAsArray();
+ if (Arr.empty())
+ return;
+ }
+ case Value::Object:
+ case Value::Boolean: {
+ llvm::json::OStream JOS(OS, 2);
+ JOS.value(Data);
+ break;
+ }
+ }
----------------
ilovepi wrote:
The formatting here looks a bit odd.
https://github.com/llvm/llvm-project/pull/105893
More information about the cfe-commits
mailing list