[llvm] [llvm][mustache] Use single pass when tokenizing (PR #159196)
Paul Kirth via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 10 09:21:30 PDT 2025
https://github.com/ilovepi updated https://github.com/llvm/llvm-project/pull/159196
>From 3338a342ceed80586ed2f580ee97333df1a77a38 Mon Sep 17 00:00:00 2001
From: Paul Kirth <paulkirth at google.com>
Date: Mon, 15 Sep 2025 23:27:50 -0700
Subject: [PATCH] [llvm][mustache] Use single pass when tokenizing
The old implementation used many string searches over the same portions
of the strings. This version sacrifices some API niceness for perf wins.
Metric | Baseline | Single-Pass | Change
-------------- | -------- | ----------- | -------
Time (ms) | 36.09 | 35.78 | -0.86%
Cycles | 35.3M | 35.0M | -0.79%
Instructions | 86.7M | 85.8M | -1.03%
Branch Misses | 116K | 114K | -1.91%
Cache Misses | 244K | 232K | -4.98%
---
llvm/lib/Support/Mustache.cpp | 186 +++++++++++++---------------------
1 file changed, 73 insertions(+), 113 deletions(-)
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index 708e79d39cd21..13fbc61ca94e4 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -368,141 +368,101 @@ struct Tag {
llvm_unreachable("Unknown json::Value::Kind");
}
-static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
- StringRef Close) {
- const StringLiteral TripleOpen("{{{");
- const StringLiteral TripleClose("}}}");
-
- size_t NormalOpenPos = Template.find(Open, StartPos);
- size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
-
- Tag Result;
-
- // Determine which tag comes first.
- if (TripleOpenPos != StringRef::npos &&
- (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
- // Found a triple mustache tag.
- size_t EndPos =
- Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Triple;
- Result.StartPosition = TripleOpenPos;
- size_t ContentStart = TripleOpenPos + TripleOpen.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch = Template.substr(
- TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
- } else if (NormalOpenPos != StringRef::npos) {
- // Found a normal mustache tag.
- size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Normal;
- Result.StartPosition = NormalOpenPos;
- size_t ContentStart = NormalOpenPos + Open.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch =
- Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
- }
-
- return Result;
-}
-
-static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
- LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
- << ", Kind: " << tagKindToString(T.TagKind) << "\n");
- if (T.TagKind == Tag::Kind::Triple) {
- Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
- return std::nullopt;
- }
- StringRef Interpolated = T.Content;
- if (!Interpolated.trim().starts_with("=")) {
- char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
- Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
- return std::nullopt;
- }
- Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
- StringRef DelimSpec = Interpolated.trim();
- DelimSpec = DelimSpec.drop_front(1);
- DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
- DelimSpec = DelimSpec.trim();
-
- std::pair<StringRef, StringRef> Ret = DelimSpec.split(' ');
- LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << Ret.first
- << ", NewClose: " << Ret.second << "\n");
- return Ret;
-}
-
// Simple tokenizer that splits the template into tokens.
-// The mustache spec allows {{{ }}} to unescape variables,
-// but we don't support that here. An unescape variable
-// is represented only by {{& variable}}.
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
SmallVector<Token> Tokens;
SmallString<8> Open("{{");
SmallString<8> Close("}}");
- size_t Start = 0;
+ size_t Cursor = 0;
+ size_t TextStart = 0;
+
+ const StringLiteral TripleOpen("{{{");
+ const StringLiteral TripleClose("}}}");
- while (Start < Template.size()) {
- LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start:" << Start << ", Open:'" << Open
- << "', Close:'" << Close << "'\n");
- Tag T = findNextTag(Template, Start, Open, Close);
+ while (Cursor < Template.size()) {
+ StringRef TemplateSuffix = Template.substr(Cursor);
+ StringRef TagOpen, TagClose;
+ Tag::Kind Kind;
+
+ // Determine which tag we've encountered.
+ if (TemplateSuffix.starts_with(TripleOpen)) {
+ Kind = Tag::Kind::Triple;
+ TagOpen = TripleOpen;
+ TagClose = TripleClose;
+ } else if (TemplateSuffix.starts_with(Open)) {
+ Kind = Tag::Kind::Normal;
+ TagOpen = Open;
+ TagClose = Close;
+ } else {
+ // Not at a tag, continue scanning.
+ ++Cursor;
+ continue;
+ }
- if (T.TagKind == Tag::Kind::None) {
- // No more tags, the rest is text.
- Tokens.emplace_back(Template.substr(Start));
- break;
+ // Found a tag, first add the preceding text.
+ if (Cursor > TextStart) {
+ Tokens.emplace_back(Template.slice(TextStart, Cursor));
}
- // Add the text before the tag.
- if (T.StartPosition > Start) {
- StringRef Text = Template.substr(Start, T.StartPosition - Start);
- Tokens.emplace_back(Text);
+ // Find the closing tag.
+ size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
+ if (EndPos == StringRef::npos) {
+ // No closing tag, the rest is text.
+ Tokens.emplace_back(Template.substr(Cursor));
+ TextStart = Cursor = Template.size();
+ break;
}
- if (auto NewDelims = processTag(T, Tokens, Ctx)) {
- std::tie(Open, Close) = *NewDelims;
+ // Extract tag content and full match.
+ size_t ContentStart = Cursor + TagOpen.size();
+ StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
+ StringRef FullMatch =
+ Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
+
+ // Process the tag (inlined logic from processTag).
+ LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
+ << ", Kind: " << tagKindToString(Kind) << "\n");
+ if (Kind == Tag::Kind::Triple) {
+ Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
+ } else { // Normal Tag
+ StringRef Interpolated = Content;
+ if (!Interpolated.trim().starts_with("=")) {
+ char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
+ Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
+ } else { // Set Delimiter
+ Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
+ StringRef DelimSpec = Interpolated.trim();
+ DelimSpec = DelimSpec.drop_front(1);
+ DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
+ DelimSpec = DelimSpec.trim();
+
+ auto [NewOpen, NewClose] = DelimSpec.split(' ');
+ LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
+ << ", NewClose: " << NewClose << "\n");
+ Open = NewOpen;
+ Close = NewClose;
+ }
}
- // Move past the tag.
- Start = T.StartPosition + T.FullMatch.size();
+ // Move past the tag for the next iteration.
+ Cursor += FullMatch.size();
+ TextStart = Cursor;
}
- // Fix up white spaces for:
- // - open sections
- // - inverted sections
- // - close sections
- // - comments
- //
- // This loop attempts to find standalone tokens and tries to trim out
- // the surrounding whitespace.
- // For example:
- // if you have the template string
- // {{#section}} \n Example \n{{/section}}
- // The output should would be
- // For example:
- // \n Example \n
+ // Add any remaining text after the last tag.
+ if (TextStart < Template.size()) {
+ Tokens.emplace_back(Template.substr(TextStart));
+ }
+
+ // Fix up white spaces for standalone tags.
size_t LastIdx = Tokens.size() - 1;
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
Token &CurrentToken = Tokens[Idx];
Token::Type CurrentType = CurrentToken.getType();
- // Check if token type requires cleanup.
- bool RequiresCleanUp = requiresCleanUp(CurrentType);
-
- if (!RequiresCleanUp)
+ if (!requiresCleanUp(CurrentType))
continue;
- // We adjust the token body if there's no text behind or ahead.
- // A token is considered to have no text ahead if the right of the previous
- // token is a newline followed by spaces.
- // A token is considered to have no text behind if the left of the next
- // token is spaces followed by a newline.
- // eg.
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
bool HasTextBehind = hasTextBehind(Idx, Tokens);
bool HasTextAhead = hasTextAhead(Idx, Tokens);
More information about the llvm-commits
mailing list