[clang-tools-extra] [llvm] [llvm] add support for mustache templating language (PR #105893)

Wed Dec 4 18:43:45 PST 2024

================
@@ -0,0 +1,785 @@
+//===-- Mustache.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
+
+namespace {
+
+// Returns true when V is JSON null, the boolean false, an empty array, or an
+// empty object. Any other value yields false.
+static bool isFalsey(const Value &V) {
+  if (V.getAsNull())
+    return true;
+  if (auto Flag = V.getAsBoolean(); Flag && !*Flag)
+    return true;
+  if (const auto *Arr = V.getAsArray(); Arr && Arr->empty())
+    return true;
+  const auto *Obj = V.getAsObject();
+  return Obj && Obj->empty();
+}
+
+// Breaks a dotted mustache name into its components, trimming whitespace
+// around each one. For example:
+//    "a.b.c" becomes {"a", "b", "c"}
+// A lone "." is kept as the single component "." because it refers to the
+// current context rather than to a nested name.
+static Accessor splitMustacheString(StringRef Str) {
+  Accessor Tokens;
+  if (Str == ".") {
+    Tokens.emplace_back(Str);
+    return Tokens;
+  }
+  for (StringRef Rest = Str; !Rest.empty();) {
+    // split() yields everything before the first '.' and everything after
+    // it; the tail is empty once no separator is left, which ends the loop.
+    auto [Head, Tail] = Rest.split('.');
+    Tokens.emplace_back(Head.trim());
+    Rest = Tail;
+  }
+  return Tokens;
+}
+
+} // namespace
+
+namespace llvm {
+namespace mustache {
+
+// One lexical unit of a mustache template. A token carries the text it was
+// created from (RawBody), that text with the tag identifier removed
+// (TokenBody), an accessor, and an indentation width.
+class Token {
+public:
+  // The kinds of token this file distinguishes.
+  enum class Type {
+    Text,
+    Variable,
+    Partial,
+    SectionOpen,
+    SectionClose,
+    InvertSectionOpen,
+    UnescapeVariable,
+    Comment,
+  };
+
+  // Defined out of line. NOTE(review): presumably builds a Text token from
+  // literal template text - confirm against the definition.
+  Token(std::string Str);
+
+  // Defined out of line. NOTE(review): presumably builds a tag token whose
+  // kind is derived from Identifier - confirm against the definition.
+  Token(std::string RawBody, std::string TokenBody, char Identifier);
+
+  // Returns a view of the body with the identifier removed. The view refers
+  // to storage owned by this token.
+  StringRef getTokenBody() const { return TokenBody; }
+
+  // Returns a view of the original, untouched text of the token.
+  StringRef getRawBody() const { return RawBody; }
+
+  // Replaces the token body; RawBody is left unchanged.
+  void setTokenBody(std::string NewBody) { TokenBody = std::move(NewBody); }
+
+  // Returns a copy of the accessor stored in this token.
+  Accessor getAccessor() const { return Accessor; }
+
+  Type getType() const { return TokenType; }
+
+  void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }
+
+  size_t getIndentation() const { return Indentation; }
+
+  // Defined out of line; maps a tag identifier character to a token kind.
+  static Type getTokenType(char Identifier);
+
+private:
+  Type TokenType;
+  // RawBody is the original string that was tokenized.
+  std::string RawBody;
+  // TokenBody is the original string with the identifier removed.
+  std::string TokenBody;
+  // NOTE(review): this member has the same name as its type, and the type
+  // name is already used above as the return type of getAccessor(). GCC
+  // diagnoses that pattern as a declaration that changes the meaning of the
+  // name. Renaming it also requires touching the out-of-line constructors,
+  // so it is only flagged here.
+  Accessor Accessor;
+  // Defaults to zero so getIndentation() is well defined even when a
+  // constructor does not set it, matching ASTNode::Indentation.
+  size_t Indentation = 0;
+};
+
+// A node of the parsed template tree. Every node keeps references to the
+// allocator and to the partial, lambda, section-lambda and escape tables that
+// were passed to its constructor, plus its own kind, body text, accessor,
+// parent link and children.
+class ASTNode {
+public:
+  enum Type {
+    Root,
+    Text,
+    Partial,
+    Variable,
+    UnescapeVariable,
+    Section,
+    InvertSection,
+  };
+
+  // Constructor for the Root node. A root has no parent, so Parent keeps its
+  // null default instead of being left uninitialized.
+  ASTNode(llvm::BumpPtrAllocator &Alloc, llvm::StringMap<ASTNode *> &Partials,
+          llvm::StringMap<Lambda> &Lambdas,
+          llvm::StringMap<SectionLambda> &SectionLambdas,
+          llvm::DenseMap<char, std::string> &Escapes)
+      : Allocator(Alloc), Partials(Partials), Lambdas(Lambdas),
+        SectionLambdas(SectionLambdas), Escapes(Escapes), T(Type::Root) {}
+
+  // Constructor for Text nodes; Body is copied into the node.
+  ASTNode(llvm::StringRef Body, ASTNode *Parent, llvm::BumpPtrAllocator &Alloc,
+          llvm::StringMap<ASTNode *> &Partials,
+          llvm::StringMap<Lambda> &Lambdas,
+          llvm::StringMap<SectionLambda> &SectionLambdas,
+          llvm::DenseMap<char, std::string> &Escapes)
+      : Allocator(Alloc), Partials(Partials), Lambdas(Lambdas),
+        SectionLambdas(SectionLambdas), Escapes(Escapes), T(Type::Text),
+        Body(Body.str()), Parent(Parent) {}
+
+  // Constructor for Section/InvertSection/Variable/UnescapeVariable Nodes.
+  // The initializers follow member declaration order (Parent before
+  // Accessor), and the by-value accessor is moved rather than copied again.
+  ASTNode(Type T, Accessor Accessor, ASTNode *Parent,
+          llvm::BumpPtrAllocator &Alloc, llvm::StringMap<ASTNode *> &Partials,
+          llvm::StringMap<Lambda> &Lambdas,
+          llvm::StringMap<SectionLambda> &SectionLambdas,
+          llvm::DenseMap<char, std::string> &Escapes)
+      : Allocator(Alloc), Partials(Partials), Lambdas(Lambdas),
+        SectionLambdas(SectionLambdas), Escapes(Escapes), T(T), Parent(Parent),
+        Accessor(std::move(Accessor)) {}
+
+  // Appends Child to this node's children. Only the pointer is stored.
+  void addChild(ASTNode *Child) { Children.emplace_back(Child); }
+
+  void setRawBody(std::string NewBody) { RawBody = std::move(NewBody); }
+
+  void setIndentation(size_t NewIndentation) { Indentation = NewIndentation; }
+
+  // Defined out of line; renders this node for Data into OS.
+  void render(const llvm::json::Value &Data, llvm::raw_ostream &OS);
+
+private:
+  // The helpers below are defined out of line.
+  void renderLambdas(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+                     Lambda &L);
+
+  void renderSectionLambdas(const llvm::json::Value &Contexts,
+                            llvm::raw_ostream &OS, SectionLambda &L);
+
+  void renderPartial(const llvm::json::Value &Contexts, llvm::raw_ostream &OS,
+                     ASTNode *Partial);
+
+  void renderChild(const llvm::json::Value &Context, llvm::raw_ostream &OS);
+
+  const llvm::json::Value *findContext();
+
+  // References to state supplied by whoever created the node; the node does
+  // not own any of it.
+  llvm::BumpPtrAllocator &Allocator;
+  StringMap<ASTNode *> &Partials;
+  StringMap<Lambda> &Lambdas;
+  StringMap<SectionLambda> &SectionLambdas;
+  DenseMap<char, std::string> &Escapes;
+  Type T;
+  size_t Indentation = 0;
+  std::string RawBody;
+  std::string Body;
+  // Null for the root node.
+  ASTNode *Parent = nullptr;
+  // TODO: switch implementation to SmallVector<T>
+  std::vector<ASTNode *> Children;
+  // NOTE(review): this member shares its name with its type, which GCC
+  // diagnoses as changing the meaning of the name. Renaming it requires
+  // touching the out-of-line member functions, so it is only flagged here.
+  const Accessor Accessor;
+  const llvm::json::Value *ParentContext = nullptr;
+};
+
+// Thin wrappers that placement-construct an ASTNode in storage obtained from
+// the arena allocator.
+
+// Placement-constructs a Root ASTNode in the storage Node points to and
+// returns it. Node must be suitably sized and aligned for an ASTNode.
+// Declared static const so the helper is immutable and local to this file.
+static const auto CreateRootNode =
+    [](void *Node, llvm::BumpPtrAllocator &Alloc,
+       llvm::StringMap<ASTNode *> &Partials, llvm::StringMap<Lambda> &Lambdas,
+       llvm::StringMap<SectionLambda> &SectionLambdas,
+       llvm::DenseMap<char, std::string> &Escapes) -> ASTNode * {
+  return new (Node) ASTNode(Alloc, Partials, Lambdas, SectionLambdas, Escapes);
+};
+
+// Placement-constructs an ASTNode of kind T with accessor A and the given
+// parent in the storage Node points to, and returns it. Node must be suitably
+// sized and aligned for an ASTNode. Declared static const so the helper is
+// immutable and local to this file.
+static const auto CreateNode =
+    [](void *Node, ASTNode::Type T, Accessor A, ASTNode *Parent,
+       llvm::BumpPtrAllocator &Alloc, llvm::StringMap<ASTNode *> &Partials,
+       llvm::StringMap<Lambda> &Lambdas,
+       llvm::StringMap<SectionLambda> &SectionLambdas,
+       llvm::DenseMap<char, std::string> &Escapes) -> ASTNode * {
+  // A is already a by-value copy, so hand it on instead of copying it again.
+  return new (Node) ASTNode(T, std::move(A), Parent, Alloc, Partials, Lambdas,
+                            SectionLambdas, Escapes);
+};
+
+// Placement-constructs a Text ASTNode holding a copy of Body in the storage
+// Node points to, and returns it. Node must be suitably sized and aligned for
+// an ASTNode. Declared static const so the helper is immutable and local to
+// this file.
+static const auto CreateTextNode =
+    [](void *Node, StringRef Body, ASTNode *Parent,
+       llvm::BumpPtrAllocator &Alloc, llvm::StringMap<ASTNode *> &Partials,
+       llvm::StringMap<Lambda> &Lambdas,
+       llvm::StringMap<SectionLambda> &SectionLambdas,
+       llvm::DenseMap<char, std::string> &Escapes) -> ASTNode * {
+  return new (Node)
+      ASTNode(Body, Parent, Alloc, Partials, Lambdas, SectionLambdas, Escapes);
+};
+
+// Checks whether the token at Tokens[Idx] has meaningful text before it.
+// The first token (Idx == 0) and any token whose predecessor is not a Text
+// token are reported as having text behind them. Otherwise the predecessor's
+// raw body is examined with trailing spaces, tabs and vertical tabs removed,
+// and the answer is false when
+//  - what remains ends with a newline, e.g. a preceding body of "Line\n  ", or
+//  - nothing remains and the token is the second one (Idx == 1), e.g. a
+//    preceding body of "  " at the very start of the template.
+// For example, the tag in
+//  "Stuff {{#Section}}"
+// has text behind it (returns true).
+bool hasTextBehind(size_t Idx, const ArrayRef<Token> &Tokens) {
+  if (Idx == 0)
+    return true;
+
+  // Keep the index as size_t; storing it in an int would narrow it.
+  const size_t PrevIdx = Idx - 1;
+  const Token &PrevToken = Tokens[PrevIdx];
+  if (PrevToken.getType() != Token::Type::Text)
+    return true;
+
+  StringRef TokenBody = PrevToken.getRawBody().rtrim(" \t\v");
+  return !TokenBody.ends_with("\n") && !(TokenBody.empty() && Idx == 1);
+}
+
+// Checks whether the token at Tokens[Idx] has meaningful text after it.
+// The last token, an index past the end, and any token whose successor is not
+// a Text token are reported as having text ahead. Otherwise the successor's
+// raw body is examined with leading spaces removed, and the answer is false
+// only when what remains starts with a line break ("\r\n" or "\n").
+bool hasTextAhead(size_t Idx, const ArrayRef<Token> &Tokens) {
+  // The empty() check comes first because size() - 1 would wrap around for an
+  // empty list and let the lookup below read out of bounds.
+  if (Tokens.empty() || Idx >= Tokens.size() - 1)
+    return true;
+
+  // Keep the index as size_t; storing it in an int would narrow it.
+  const size_t NextIdx = Idx + 1;
+  const Token &NextToken = Tokens[NextIdx];
+  if (NextToken.getType() != Token::Type::Text)
+    return true;
+
+  StringRef TokenBody = NextToken.getRawBody().ltrim(" ");
+  return !TokenBody.starts_with("\r\n") && !TokenBody.starts_with("\n");
+}
+
+// Returns true for the token kinds that need whitespace clean-up: section
+// open, inverted section open, section close, comment and partial tags.
+// Text and both variable kinds return false.
+bool requiresCleanUp(Token::Type T) {
+  switch (T) {
+  case Token::Type::SectionOpen:
+  case Token::Type::InvertSectionOpen:
+  case Token::Type::SectionClose:
+  case Token::Type::Comment:
+  case Token::Type::Partial:
+    return true;
+  case Token::Type::Text:
+  case Token::Type::Variable:
+  case Token::Type::UnescapeVariable:
+    return false;
+  }
+  // Any value outside the enumerators is treated like a non-clean-up token.
+  return false;
+}
+
+// Removes one leading line break from the body of the token that follows
+// Tokens[Idx]. The caller must ensure Tokens[Idx + 1] exists.
+// For example, with the template string
+//  "{{! Comment }}\nLine 2"
+// the text after the comment tag becomes
+//  "Line 2"
+// NOTE(review): only a line break at the very start of the body is removed;
+// spaces in front of it are not skipped here, although hasTextAhead ignores
+// them - confirm this is intended.
+void stripTokenAhead(SmallVectorImpl<Token> &Tokens, size_t Idx) {
+  Token &Following = Tokens[Idx + 1];
+  StringRef Remainder = Following.getTokenBody();
+  // consume_front reports true only when it removed the prefix, so the body
+  // is rewritten only if a line break was present. "\r\n" is tried first so a
+  // CRLF pair is dropped as a unit.
+  if (Remainder.consume_front("\r\n") || Remainder.consume_front("\n"))
+    Following.setTokenBody(Remainder.str());
+}
+
+// Handles the whitespace in front of a token that has no text behind it.
+// The number of trailing spaces, tabs and vertical tabs in the previous
+// token's body is recorded on CurrentToken as its indentation, and that
+// whitespace is then removed from the previous token.
+// For example, with the template string
+//  " \t{{#section}}A{{/section}}"
+// the " \t" in front of the section tag is dropped, so the output is
+//  "A"
+// Partial tags are the exception: the previous token is left untouched and
+// only the indentation is recorded.
+void stripTokenBefore(SmallVectorImpl<Token> &Tokens, size_t Idx,
+                      Token &CurrentToken, Token::Type CurrentType) {
+  Token &Preceding = Tokens[Idx - 1];
+  StringRef Body = Preceding.getTokenBody();
+  StringRef Trimmed = Body.rtrim(" \t\v");
+  // Measure the width first: both views refer to the previous token's storage,
+  // which setTokenBody replaces.
+  size_t TrailingWidth = Body.size() - Trimmed.size();
+  CurrentToken.setIndentation(TrailingWidth);
+  if (CurrentType == Token::Type::Partial)
+    return;
+  Preceding.setTokenBody(Trimmed.str());
+}
+
+// Simple tokenizer that splits the template into tokens.
+// The mustache spec allows {{{ }}} to unescape variables,
+// but we don't support that here. An unescape variable
+// is represented only by {{& variable}}.
+SmallVector<Token> tokenize(StringRef Template) {
+  SmallVector<Token> Tokens;
+  const StringRef Open("{{");
+  const StringRef Close("}}");
----------------
ilovepi wrote:

`StringRef` is already const, so you don't need to mark it as such.

https://github.com/llvm/llvm-project/pull/105893