[clang-tools-extra] [llvm] [llvm] add support for mustache templating language (PR #105893)

Fri Nov 15 13:40:57 PST 2024

================
@@ -0,0 +1,724 @@
+//===-- Mustache.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace llvm::json;
+
+namespace llvm {
+namespace mustache {
+
+class Token {
+public:
+  enum class Type {
+    Text,
+    Variable,
+    Partial,
+    SectionOpen,
+    SectionClose,
+    InvertSectionOpen,
+    UnescapeVariable,
+    Comment,
+  };
+
+  Token(StringRef Str);
+
+  Token(StringRef RawBody, StringRef Str, char Identifier);
+
+  StringRef getTokenBody() const { return TokenBody; };
+
+  StringRef getRawBody() const { return RawBody; };
+
+  void setTokenBody(StringRef NewBody) { TokenBody = NewBody.str(); };
+
+  Accessor getAccessor() const { return Accessor; };
+
+  Type getType() const { return TokenType; };
+
+  void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
+
+  size_t getIndentation() const { return Indentation; };
+
+  static Type getTokenType(char Identifier);
+
+private:
+  size_t Indentation;
+  Type TokenType;
+  // RawBody is the original string that was tokenized
+  std::string RawBody;
+  Accessor Accessor;
+  // TokenBody is the original string with the identifier removed
+  std::string TokenBody;
+};
+
+class ASTNode {
+public:
+  enum Type {
+    Root,
+    Text,
+    Partial,
+    Variable,
+    UnescapeVariable,
+    Section,
+    InvertSection,
+  };
+
+  ASTNode() : T(Type::Root), ParentContext(nullptr) {};
+
+  ASTNode(StringRef Body, ASTNode *Parent)
+      : T(Type::Text), Body(Body), Parent(Parent), ParentContext(nullptr) {};
+
+  // Constructor for Section/InvertSection/Variable/UnescapeVariable
+  ASTNode(Type T, Accessor Accessor, ASTNode *Parent)
+      : T(T), Parent(Parent), Children({}), Accessor(Accessor),
+        ParentContext(nullptr) {};
+
+  void addChild(ASTNode *Child) { Children.emplace_back(Child); };
+
+  void setBody(StringRef NewBody) { Body = NewBody; };
+
+  void setRawBody(StringRef NewBody) { RawBody = NewBody; };
+
+  void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; };
+
+  void render(const llvm::json::Value &Data, llvm::raw_ostream &OS);
+
+  void setUpNode(llvm::BumpPtrAllocator &Alloc, StringMap<ASTNode *> &Partials,
+                 StringMap<Lambda> &Lambdas,
+                 StringMap<SectionLambda> &SectionLambdas,
+                 DenseMap<char, StringRef> &Escapes);
+
+private:
+  void renderLambdas(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+                     Lambda &L);
+
+  void renderSectionLambdas(const llvm::json::Value &Contexts,
+                            llvm::raw_ostream &OS, SectionLambda &L);
+
+  void renderPartial(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+                     ASTNode *Partial);
+
+  void renderChild(const llvm::json::Value &Context, llvm::raw_ostream &OS);
+
+  const llvm::json::Value *findContext();
+
+  llvm::BumpPtrAllocator *Allocator;
+  StringMap<ASTNode *> *Partials;
+  StringMap<Lambda> *Lambdas;
+  StringMap<SectionLambda> *SectionLambdas;
+  DenseMap<char, StringRef> *Escapes;
+  Type T;
+  size_t Indentation = 0;
+  std::string RawBody;
+  std::string Body;
+  ASTNode *Parent;
+  // TODO: switch implementation to SmallVector<T>
+  std::vector<ASTNode *> Children;
+  const Accessor Accessor;
+  const llvm::json::Value *ParentContext;
+};
+
+// Custom stream to escape strings
+class EscapeStringStream : public raw_ostream {
+public:
+  explicit EscapeStringStream(llvm::raw_ostream &WrappedStream,
+                              DenseMap<char, StringRef> &Escape)
+      : Escape(Escape), WrappedStream(WrappedStream) {
+    SetUnbuffered();
+  }
+
+protected:
+  void write_impl(const char *Ptr, size_t Size) override {
+    llvm::StringRef Data(Ptr, Size);
+    for (char C : Data) {
+      auto It = Escape.find(C);
+      if (It != Escape.end())
+        WrappedStream << It->getSecond();
+      else
+        WrappedStream << C;
+    }
+  }
+
+  uint64_t current_pos() const override { return WrappedStream.tell(); }
+
+private:
+  DenseMap<char, StringRef> &Escape;
+  llvm::raw_ostream &WrappedStream;
+};
+
+// Custom stream to add indentation used to for rendering partials
+class AddIndentationStringStream : public raw_ostream {
+public:
+  explicit AddIndentationStringStream(llvm::raw_ostream &WrappedStream,
+                                      size_t Indentation)
+      : Indentation(Indentation), WrappedStream(WrappedStream) {
+    SetUnbuffered();
+  }
+
+protected:
+  void write_impl(const char *Ptr, size_t Size) override {
+    llvm::StringRef Data(Ptr, Size);
+    std::string Indent(Indentation, ' ');
+    for (char C : Data) {
+      WrappedStream << C;
+      if (C == '\n')
+        WrappedStream << Indent;
+    }
+  }
+
+  uint64_t current_pos() const override { return WrappedStream.tell(); }
+
+private:
+  size_t Indentation;
+  llvm::raw_ostream &WrappedStream;
+};
+
+Accessor split(StringRef Str, char Delimiter) {
+  Accessor Tokens;
+  if (Str == ".") {
+    Tokens.emplace_back(Str);
+    return Tokens;
+  }
+  StringRef Ref(Str);
+  while (!Ref.empty()) {
+    StringRef Part;
+    std::tie(Part, Ref) = Ref.split(Delimiter);
+    Tokens.emplace_back(Part.trim());
+  }
+  return Tokens;
+}
+
+Token::Token(StringRef RawBody, StringRef InnerBody, char Identifier)
+    : RawBody(RawBody), TokenBody(InnerBody), Indentation(0) {
+  TokenType = getTokenType(Identifier);
+  if (TokenType == Type::Comment)
+    return;
+
+  StringRef AccessorStr =
+      TokenType == Type::Variable ? InnerBody : InnerBody.substr(1);
+
+  Accessor = split(AccessorStr.trim(), '.');
+}
+
+Token::Token(StringRef Str)
+    : TokenType(Type::Text), RawBody(Str), Accessor({}), TokenBody(Str),
+      Indentation(0) {}
+
+Token::Type Token::getTokenType(char Identifier) {
+  switch (Identifier) {
+  case '#':
+    return Type::SectionOpen;
+  case '/':
+    return Type::SectionClose;
+  case '^':
+    return Type::InvertSectionOpen;
+  case '!':
+    return Type::Comment;
+  case '>':
+    return Type::Partial;
+  case '&':
+    return Type::UnescapeVariable;
+  default:
+    return Type::Variable;
+  }
+}
+
+// Function to check if there is meaningful text behind.
+// We determine if a token has meaningful text behind
+// if the right of previous token contains anything that is
+// not a newline
+// For example: "Stuff {{#Section}}"
+// We make an exception for when previous token is empty
+// and the current token is the second token
+// For example: "{{#Section}}"
+static bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) {
+  if (Idx == 0)
+    return true;
+
+  int PrevIdx = Idx - 1;
+  if (Tokens[PrevIdx].getType() != Token::Type::Text)
+    return true;
+  
+  const Token &PrevToken = Tokens[Idx - 1];
+  StringRef TokenBody = PrevToken.getRawBody().rtrim(" \t\v");
+  return !TokenBody.ends_with("\n") && !(TokenBody.empty() && Idx == 1);
+}
+
+// Function to check if there's no meaningful text ahead
+// We determine if a token has no meaningful text behind
+// if the left of previous token is empty spaces or tabs followed
+// by a newline
+// For example: "{{#Section}}  \n"
+static bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) {
+  if (Idx >= Tokens.size() - 1)
+    return true;
+
+  int NextIdx = Idx + 1;
+  if (Tokens[NextIdx].getType() != Token::Type::Text)
+    return true;
+
+  const Token &NextToken = Tokens[Idx + 1];
+  StringRef TokenBody = NextToken.getRawBody().ltrim(" ");
+  return !TokenBody.starts_with("\r\n") && !TokenBody.starts_with("\n");
+}
+
+static bool requiresCleanUp(Token::Type T) {
+  // We must clean up all the tokens that could contain child nodes
+  return T == Token::Type::SectionOpen || T == Token::Type::InvertSectionOpen ||
+         T == Token::Type::SectionClose || T == Token::Type::Comment ||
+         T == Token::Type::Partial;
+}
+
+// Adjust next token body if there is no text ahead
+// For example:
+//  The template string
+//  "{{! Comment }} \nLine 2"
+// would be considered as no text ahead and should be render as
+//  " Line 2"
+static void stripTokenAhead(SmallVector<Token> &Tokens, size_t Idx) {
+  Token &NextToken = Tokens[Idx + 1];
+  StringRef NextTokenBody = NextToken.getTokenBody();
+  // cut off the leading newline which could be \n or \r\n
+  if (NextTokenBody.starts_with("\r\n"))
+    NextToken.setTokenBody(NextTokenBody.substr(2));
+  else if (NextTokenBody.starts_with("\n"))
+    NextToken.setTokenBody(NextTokenBody.substr(1));
+}
+
+// Adjust previous token body if there no text behind
+// For example:
+//  The template string
+//  " \t{{#section}}A{{/section}}"
+// would be considered as having no text ahead and would be render as
+//  "A"
+// The exception for this is partial tag which requires us to
+// keep track of the indentation once it's rendered.
+static void stripTokenBefore(SmallVector<Token> &Tokens, size_t Idx,
+                      Token &CurrentToken, Token::Type CurrentType) {
+  Token &PrevToken = Tokens[Idx - 1];
+  StringRef PrevTokenBody = PrevToken.getTokenBody();
+  StringRef Unindented = PrevTokenBody.rtrim(" \t\v");
+  size_t Indentation = PrevTokenBody.size() - Unindented.size();
+  if (CurrentType != Token::Type::Partial)
+    PrevToken.setTokenBody(Unindented);
+  CurrentToken.setIndentation(Indentation);
+}
+
+// Simple tokenizer that splits the template into tokens.
+// The mustache spec allows {{{ }}} to unescape variables
+// but we don't support that here. An unescape variable
+// is represented only by {{& variable}}.
+SmallVector<Token> tokenize(StringRef Template) {
+  SmallVector<Token> Tokens;
+  std::string Open("{{");
+  std::string Close("}}");
+  size_t Start = 0;
+  size_t DelimiterStart = Template.find(Open);
+  if (DelimiterStart == StringRef::npos) {
+    Tokens.emplace_back(Template);
+    return Tokens;
+  }
+  while (DelimiterStart != StringRef::npos) {
+    if (DelimiterStart != Start) {
+      Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start));
+    }
+
+    size_t DelimiterEnd = Template.find(Close, DelimiterStart);
+    if (DelimiterEnd == StringRef::npos) {
+      break;
+    }
+
+    // Extract the Interpolated variable without delimiters {{ and }}
+    size_t InterpolatedStart = DelimiterStart + Open.size();
+    size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size();
+    std::string Interpolated =
+        Template.substr(InterpolatedStart, InterpolatedEnd).str();
+    std::string RawBody = Open + Interpolated + Close;
+    Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]);
+    Start = DelimiterEnd + Close.size();
+    DelimiterStart = Template.find(Open, Start);
+  }
+
+  if (Start < Template.size())
+    Tokens.emplace_back(Template.substr(Start));
+
+  // Fix up white spaces for:
+  //   - open sections
+  //   - inverted sections
+  //   - close sections
+  //   - comments
+  //
+  // This loop attempts to find standalone tokens and tries to trim out
+  // the surrounding whitespace.
+  // For example:
+  // if you have the template string
+  //  {{#section}} \n Example \n{{/section}}
+  // The output should would be
+  // For example
+  //  \n Example \n
+  size_t LastIdx = Tokens.size() - 1;
+  for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
+    Token &CurrentToken = Tokens[Idx];
+    Token::Type CurrentType = CurrentToken.getType();
+    // Check if token type requires cleanup
+    bool RequiresCleanUp = requiresCleanUp(CurrentType);
+
+    if (!RequiresCleanUp)
+      continue;
+
+    // We adjust the token body if there's no text behind or ahead.
+    // A token is considered to have no text ahead if the right of the previous
+    // token is a newline followed by spaces.
+    // A token is considered to have no text behind if the left of the next
+    // token is spaces followed by a newline.
+    // eg.
+    //  "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
+    bool HasTextBehind = hasTextBehind(Idx, Tokens);
+    bool HasTextAhead = hasTextAhead(Idx, Tokens);
+
+    if ((!HasTextAhead && !HasTextBehind) || (!HasTextAhead && Idx == 0))
+      stripTokenAhead(Tokens, Idx);
+    
+    if ((!HasTextBehind && !HasTextAhead) || (!HasTextBehind && Idx == LastIdx))
+      stripTokenBefore(Tokens, Idx, CurrentToken, CurrentType);
+  }
+  return Tokens;
+}
+
+class Parser {
+public:
+  Parser(StringRef TemplateStr, BumpPtrAllocator &Allocator)
+      : Allocator(Allocator), TemplateStr(TemplateStr) {}
+
+  ASTNode *parse();
+
+private:
+  void parseMustache(ASTNode *Parent);
+
+  BumpPtrAllocator &Allocator;
+  SmallVector<Token> Tokens;
+  size_t CurrentPtr;
+  StringRef TemplateStr;
+};
+
+ASTNode *Parser::parse() {
+  Tokens = tokenize(TemplateStr);
+  CurrentPtr = 0;
+  void *Root = Allocator.Allocate(sizeof(ASTNode), alignof(ASTNode));
+  ASTNode *RootNode = new (Root) ASTNode();
+  parseMustache(RootNode);
+  return RootNode;
+}
+
+void Parser::parseMustache(ASTNode *Parent) {
+
+  while (CurrentPtr < Tokens.size()) {
+    Token CurrentToken = Tokens[CurrentPtr];
+    CurrentPtr++;
+    Accessor A = CurrentToken.getAccessor();
+    ASTNode *CurrentNode;
+    void *Node = Allocator.Allocate(sizeof(ASTNode), alignof(ASTNode));
+
+    switch (CurrentToken.getType()) {
+    case Token::Type::Text: {
+      CurrentNode = new (Node) ASTNode(CurrentToken.getTokenBody(), Parent);
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::Variable: {
+      CurrentNode = new (Node) ASTNode(ASTNode::Variable, A, Parent);
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::UnescapeVariable: {
+      CurrentNode = new (Node) ASTNode(ASTNode::UnescapeVariable, A, Parent);
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::Partial: {
+      CurrentNode = new (Node) ASTNode(ASTNode::Partial, A, Parent);
+      CurrentNode->setIndentation(CurrentToken.getIndentation());
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::SectionOpen: {
+      CurrentNode = new (Node) ASTNode(ASTNode::Section, A, Parent);
+      size_t Start = CurrentPtr;
+      parseMustache(CurrentNode);
+      size_t End = CurrentPtr;
+      std::string RawBody;
+      for (std::size_t I = Start; I < End - 1; I++)
+        RawBody += Tokens[I].getRawBody();
+      CurrentNode->setRawBody(RawBody);
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::InvertSectionOpen: {
+      CurrentNode = new (Node) ASTNode(ASTNode::InvertSection, A, Parent);
+      size_t Start = CurrentPtr;
+      parseMustache(CurrentNode);
+      size_t End = CurrentPtr;
+      std::string RawBody;
+      for (size_t Idx = Start; Idx < End - 1; Idx++)
+        RawBody += Tokens[Idx].getRawBody();
+      CurrentNode->setRawBody(RawBody);
+      Parent->addChild(CurrentNode);
+      break;
+    }
+    case Token::Type::Comment:
+      break;
+    case Token::Type::SectionClose:
+      return;
+    }
+  }
+}
+
+void Template::render(Value &Data, llvm::raw_ostream &OS) {
+  Tree->render(Data, OS);
+  LocalAllocator.Reset();
+}
+
+void Template::registerPartial(StringRef Name, StringRef Partial) {
+  Parser P = Parser(Partial, Allocator);
+  ASTNode *PartialTree = P.parse();
+  PartialTree->setUpNode(LocalAllocator, Partials, Lambdas, SectionLambdas,
+                         Escapes);
+  Partials.insert(std::make_pair(Name, PartialTree));
+}
+
+void Template::registerLambda(StringRef Name, Lambda L) { Lambdas[Name] = L; }
+
+void Template::registerLambda(StringRef Name, SectionLambda L) {
+  SectionLambdas[Name] = L;
+}
+
+void Template::registerEscape(DenseMap<char, StringRef> E) { Escapes = E; }
+
+Template::Template(StringRef TemplateStr) {
+  Parser P = Parser(TemplateStr, Allocator);
+  Tree = P.parse();
+  // the default behaviour is to escape html entities
+  DenseMap<char, StringRef> HtmlEntities = {{'&', "&"},
+                                            {'<', "<"},
+                                            {'>', ">"},
+                                            {'"', """},
+                                            {'\'', "'"}};
+  registerEscape(HtmlEntities);
+  Tree->setUpNode(LocalAllocator, Partials, Lambdas, SectionLambdas, Escapes);
+}
+
+void toJsonString(const Value &Data, raw_ostream &OS) {
+  switch (Data.kind()) {
+  case Value::Null:
+    return;
+  case Value::Number: {
+    auto Num = *Data.getAsNumber();
+    std::ostringstream Oss;
+    Oss << Num;
+    OS << Oss.str();
+    return;
+  }
+  case Value::String: {
+    auto Str = *Data.getAsString();
+    OS << Str.str();
+    return;
+  }
+  case Value::Array: {
+    auto Arr = *Data.getAsArray();
+    if (Arr.empty())
+      return;
+  }
+  case Value::Object:
+  case Value::Boolean: {
+    llvm::json::OStream JOS(OS, 2);
+    JOS.value(Data);
+    break;
+  }
+  }
+}
+
+static bool isFalsey(const Value &V) {
----------------
ilovepi wrote:

Oh, I think these are in the `llvm::mustache` namespace. The static helpers should probably be in the anonymous namespace. usually we put APIs like this towards the top of the file to avoid opening and closing surrounding namespaces too much.

https://github.com/llvm/llvm-project/pull/105893