[clang-tools-extra] [llvm] [llvm] add support for mustache templating language (PR #105893)
Paul Kirth via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 15 11:21:03 PST 2024
================
@@ -0,0 +1,567 @@
+//===-- Mustache.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Mustache.h"
+#include "llvm/Support/Error.h"
+#include <sstream>
+
+using namespace llvm;
+using namespace llvm::json;
+using namespace llvm::mustache;
+
+SmallString<0> escapeString(StringRef Input,
+ DenseMap<char, StringRef> &Escape) {
+ SmallString<0> Output;
+ Output.reserve(Input.size());
+ for (char C : Input) {
+ auto It = Escape.find(C);
+ if (It != Escape.end())
+ Output += It->getSecond();
+ else
+ Output += C;
+ }
+ return Output;
+}
+
+Accessor split(StringRef Str, char Delimiter) {
+ Accessor Tokens;
+ if (Str == ".") {
+ Tokens.emplace_back(Str);
+ return Tokens;
+ }
+ StringRef Ref(Str);
+ while (!Ref.empty()) {
+ StringRef Part;
+ std::tie(Part, Ref) = Ref.split(Delimiter);
+ Tokens.emplace_back(Part.trim());
+ }
+ return Tokens;
+}
+
+void addIndentation(llvm::SmallString<0> &PartialStr, size_t IndentationSize) {
+ std::string Indent(IndentationSize, ' ');
+ llvm::SmallString<0> Result;
+ for (size_t I = 0; I < PartialStr.size(); ++I) {
+ Result.push_back(PartialStr[I]);
+ if (PartialStr[I] == '\n' && I < PartialStr.size() - 1)
+ Result.append(Indent);
+ }
+ PartialStr = Result;
+}
+
+Token::Token(StringRef RawBody, StringRef InnerBody, char Identifier)
+ : RawBody(RawBody), TokenBody(InnerBody), Indentation(0) {
+ TokenType = getTokenType(Identifier);
+ if (TokenType == Type::Comment)
+ return;
+
+ StringRef AccessorStr =
+ TokenType == Type::Variable ? InnerBody : InnerBody.substr(1);
+
+ Accessor = split(AccessorStr.trim(), '.');
+}
+
+Token::Token(StringRef Str)
+ : TokenType(Type::Text), RawBody(Str), Accessor({}), TokenBody(Str),
+ Indentation(0) {}
+
+Token::Type Token::getTokenType(char Identifier) {
+ switch (Identifier) {
+ case '#':
+ return Type::SectionOpen;
+ case '/':
+ return Type::SectionClose;
+ case '^':
+ return Type::InvertSectionOpen;
+ case '!':
+ return Type::Comment;
+ case '>':
+ return Type::Partial;
+ case '&':
+ return Type::UnescapeVariable;
+ default:
+ return Type::Variable;
+ }
+}
+
+// Function to check if there's no meaningful text behind
+bool noTextBehind(size_t Idx, const SmallVector<Token, 0> &Tokens) {
+ if (Idx == 0)
+ return false;
+
+ int PrevIdx = Idx - 1;
+ if (Tokens[PrevIdx].getType() != Token::Type::Text)
+ return false;
+
+ const Token &PrevToken = Tokens[Idx - 1];
+ StringRef TokenBody = PrevToken.getRawBody().rtrim(" \t\v\t");
+ // We make an exception for when previous token is empty
+ // and the current token is the second token
+ // ex.
+ // " {{#section}}A{{/section}}"
+ // the output of this is
+ // "A"
+ return TokenBody.ends_with("\n") || (TokenBody.empty() && Idx == 1);
+}
+
+// Function to check if there's no meaningful text ahead
+bool noTextAhead(size_t Idx, const SmallVector<Token, 0> &Tokens) {
+ if (Idx >= Tokens.size() - 1)
+ return false;
+
+ int PrevIdx = Idx + 1;
+ if (Tokens[Idx + 1].getType() != Token::Type::Text)
+ return false;
+
+ const Token &NextToken = Tokens[Idx + 1];
+ StringRef TokenBody = NextToken.getRawBody().ltrim(" ");
+ return TokenBody.starts_with("\r\n") || TokenBody.starts_with("\n");
+}
+
+bool requiresCleanUp(Token::Type T) {
+ // We must clean up all the tokens that could contain child nodes
+ return T == Token::Type::SectionOpen || T == Token::Type::InvertSectionOpen ||
+ T == Token::Type::SectionClose || T == Token::Type::Comment ||
+ T == Token::Type::Partial;
+}
+
+// Adjust next token body if there is no text ahead
+// eg.
+// The template string
+// "{{! Comment }} \nLine 2"
+// would be considered as no text ahead and should be render as
+// " Line 2"
+void stripTokenAhead(SmallVector<Token, 0> &Tokens, size_t Idx) {
+ Token &NextToken = Tokens[Idx + 1];
+ StringRef NextTokenBody = NextToken.getTokenBody();
+ // cut off the leading newline which could be \n or \r\n
+ if (NextTokenBody.starts_with("\r\n"))
+ NextToken.setTokenBody(NextTokenBody.substr(2));
+ else if (NextTokenBody.starts_with("\n"))
+ NextToken.setTokenBody(NextTokenBody.substr(1));
+}
+
+// Adjust previous token body if there no text behind
+// eg.
+// The template string
+// " \t{{#section}}A{{/section}}"
+// would be considered as no text ahead and should be render as
+// "A"
+// The exception for this is partial tag which requires us to
+// keep track of the indentation once it's rendered
+void stripTokenBefore(SmallVector<Token, 0> &Tokens, size_t Idx,
+ Token &CurrentToken, Token::Type CurrentType) {
+ Token &PrevToken = Tokens[Idx - 1];
+ StringRef PrevTokenBody = PrevToken.getTokenBody();
+ StringRef Unindented = PrevTokenBody.rtrim(" \t\v");
+ size_t Indentation = PrevTokenBody.size() - Unindented.size();
+ if (CurrentType != Token::Type::Partial)
+ PrevToken.setTokenBody(Unindented);
+ CurrentToken.setIndentation(Indentation);
+}
+
+// Simple tokenizer that splits the template into tokens.
+// The mustache spec allows {{{ }}} to unescape variables
+// but we don't support that here. An unescape variable
+// is represented only by {{& variable}}.
+SmallVector<Token, 0> tokenize(StringRef Template) {
+ SmallVector<Token, 0> Tokens;
+ StringRef Open("{{");
+ StringRef Close("}}");
+ size_t Start = 0;
+ size_t DelimiterStart = Template.find(Open);
+ if (DelimiterStart == StringRef::npos) {
+ Tokens.emplace_back(Template);
+ return Tokens;
+ }
+ while (DelimiterStart != StringRef::npos) {
+ if (DelimiterStart != Start) {
+ Tokens.emplace_back(Template.substr(Start, DelimiterStart - Start));
+ }
+
+ size_t DelimiterEnd = Template.find(Close, DelimiterStart);
+ if (DelimiterEnd == StringRef::npos) {
+ break;
+ }
+
+ // Extract the Interpolated variable without delimiters {{ and }}
+ size_t InterpolatedStart = DelimiterStart + Open.size();
+ size_t InterpolatedEnd = DelimiterEnd - DelimiterStart - Close.size();
+ SmallString<0> Interpolated =
+ Template.substr(InterpolatedStart, InterpolatedEnd);
+ SmallString<0> RawBody({Open, Interpolated, Close});
+ Tokens.emplace_back(RawBody, Interpolated, Interpolated[0]);
+ Start = DelimiterEnd + Close.size();
+ DelimiterStart = Template.find(Open, Start);
+ }
+
+ if (Start < Template.size())
+ Tokens.emplace_back(Template.substr(Start));
+
+ // Fix up white spaces for:
+ // - open sections
+ // - inverted sections
+ // - close sections
+ // - comments
+ //
+ // This loop attempts to find standalone tokens and tries to trim out
+ // the surrounding whitespace.
+ // For example:
+ // if you have the template string
+ // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
+ // The output would be
+ // "Line 1\n Line 2\n Line 3"
+ size_t LastIdx = Tokens.size() - 1;
+ for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
+ Token &CurrentToken = Tokens[Idx];
+ Token::Type CurrentType = CurrentToken.getType();
+ // Check if token type requires cleanup
+ bool RequiresCleanUp = requiresCleanUp(CurrentType);
+
+ if (!RequiresCleanUp)
+ continue;
+
+ // We adjust the token body if there's no text behind or ahead.
+ // A token is considered to have no text ahead if the right of the previous
+ // token is a newline followed by spaces.
+ // A token is considered to have no text behind if the left of the next
+ // token is spaces followed by a newline.
+ // eg.
+ // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
+ bool NoTextBehind = noTextBehind(Idx, Tokens);
+ bool NoTextAhead = noTextAhead(Idx, Tokens);
----------------
ilovepi wrote:
These are the only uses of these APIs, right? does it make more sense to use them in `stripTokenBefore/After` instead of here? or maybe since this is the only use, instead of the `noTextBehind/Ahead` APIs you should have `shouldStripTokenAhead/Behind`? some of that is going to boil down to which APIs you think are more useful.
https://github.com/llvm/llvm-project/pull/105893
More information about the llvm-commits
mailing list