[llvm][mustache] Use single pass when tokenizing#159196
Merged
Merged
Conversation
This was referenced Sep 16, 2025
Contributor
Author
This was referenced Sep 16, 2025
Member
|
@llvm/pr-subscribers-llvm-support Author: Paul Kirth (ilovepi) ChangesThe old implementation used many string searches over the same portions
Full diff: https://github.com/llvm/llvm-project/pull/159196.diff 1 Files Affected:
diff --git a/llvm/lib/Support/Mustache.cpp b/llvm/lib/Support/Mustache.cpp
index f8095a4eb1acc..63798c50f57ee 100644
--- a/llvm/lib/Support/Mustache.cpp
+++ b/llvm/lib/Support/Mustache.cpp
@@ -371,141 +371,101 @@ static const char *jsonKindToString(json::Value::Kind K) {
llvm_unreachable("Unknown json::Value::Kind");
}
-static Tag findNextTag(StringRef Template, size_t StartPos, StringRef Open,
- StringRef Close) {
- const StringLiteral TripleOpen("{{{");
- const StringLiteral TripleClose("}}}");
-
- size_t NormalOpenPos = Template.find(Open, StartPos);
- size_t TripleOpenPos = Template.find(TripleOpen, StartPos);
-
- Tag Result;
-
- // Determine which tag comes first.
- if (TripleOpenPos != StringRef::npos &&
- (NormalOpenPos == StringRef::npos || TripleOpenPos <= NormalOpenPos)) {
- // Found a triple mustache tag.
- size_t EndPos =
- Template.find(TripleClose, TripleOpenPos + TripleOpen.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Triple;
- Result.StartPosition = TripleOpenPos;
- size_t ContentStart = TripleOpenPos + TripleOpen.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch = Template.substr(
- TripleOpenPos, (EndPos + TripleClose.size()) - TripleOpenPos);
- } else if (NormalOpenPos != StringRef::npos) {
- // Found a normal mustache tag.
- size_t EndPos = Template.find(Close, NormalOpenPos + Open.size());
- if (EndPos == StringRef::npos)
- return Result; // No closing tag found.
-
- Result.TagKind = Tag::Kind::Normal;
- Result.StartPosition = NormalOpenPos;
- size_t ContentStart = NormalOpenPos + Open.size();
- Result.Content = Template.substr(ContentStart, EndPos - ContentStart);
- Result.FullMatch =
- Template.substr(NormalOpenPos, (EndPos + Close.size()) - NormalOpenPos);
- }
-
- return Result;
-}
-
-static std::optional<std::pair<StringRef, StringRef>>
-processTag(const Tag &T, SmallVectorImpl<Token> &Tokens, MustacheContext &Ctx) {
- LLVM_DEBUG(dbgs() << "[Tag] " << T.FullMatch << ", Content: " << T.Content
- << ", Kind: " << tagKindToString(T.TagKind) << "\n");
- if (T.TagKind == Tag::Kind::Triple) {
- Tokens.emplace_back(T.FullMatch, Ctx.Saver.save("&" + T.Content), '&', Ctx);
- return std::nullopt;
- }
- StringRef Interpolated = T.Content;
- if (!Interpolated.trim().starts_with("=")) {
- char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
- Tokens.emplace_back(T.FullMatch, Interpolated, Front, Ctx);
- return std::nullopt;
- }
- Tokens.emplace_back(T.FullMatch, Interpolated, '=', Ctx);
- StringRef DelimSpec = Interpolated.trim();
- DelimSpec = DelimSpec.drop_front(1);
- DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
- DelimSpec = DelimSpec.trim();
-
- auto [NewOpen, NewClose] = DelimSpec.split(' ');
- LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
- << ", NewClose: " << NewClose << "\n");
- return std::make_pair(NewOpen, NewClose);
-}
-
// Simple tokenizer that splits the template into tokens.
-// The mustache spec allows {{{ }}} to unescape variables,
-// but we don't support that here. An unescape variable
-// is represented only by {{& variable}}.
static SmallVector<Token> tokenize(StringRef Template, MustacheContext &Ctx) {
LLVM_DEBUG(dbgs() << "[Tokenize Template] \"" << Template << "\"\n");
SmallVector<Token> Tokens;
SmallString<8> Open("{{");
SmallString<8> Close("}}");
- size_t Start = 0;
+ size_t Cursor = 0;
+ size_t TextStart = 0;
+
+ const StringLiteral TripleOpen("{{{");
+ const StringLiteral TripleClose("}}}");
- while (Start < Template.size()) {
- LLVM_DEBUG(dbgs() << "[Tokenize Loop] Start=" << Start << ", Open='" << Open
- << "', Close='" << Close << "'\n");
- Tag T = findNextTag(Template, Start, Open, Close);
+ while (Cursor < Template.size()) {
+ StringRef TemplateSuffix = Template.substr(Cursor);
+ StringRef TagOpen, TagClose;
+ Tag::Kind Kind;
+
+ // Determine which tag we've encountered.
+ if (TemplateSuffix.starts_with(TripleOpen)) {
+ Kind = Tag::Kind::Triple;
+ TagOpen = TripleOpen;
+ TagClose = TripleClose;
+ } else if (TemplateSuffix.starts_with(Open)) {
+ Kind = Tag::Kind::Normal;
+ TagOpen = Open;
+ TagClose = Close;
+ } else {
+ // Not at a tag, continue scanning.
+ ++Cursor;
+ continue;
+ }
- if (T.TagKind == Tag::Kind::None) {
- // No more tags, the rest is text.
- Tokens.emplace_back(Template.substr(Start));
- break;
+ // Found a tag, first add the preceding text.
+ if (Cursor > TextStart) {
+ Tokens.emplace_back(Template.slice(TextStart, Cursor));
}
- // Add the text before the tag.
- if (T.StartPosition > Start) {
- StringRef Text = Template.substr(Start, T.StartPosition - Start);
- Tokens.emplace_back(Text);
+ // Find the closing tag.
+ size_t EndPos = Template.find(TagClose, Cursor + TagOpen.size());
+ if (EndPos == StringRef::npos) {
+ // No closing tag, the rest is text.
+ Tokens.emplace_back(Template.substr(Cursor));
+ TextStart = Cursor = Template.size();
+ break;
}
- if (auto NewDelims = processTag(T, Tokens, Ctx)) {
- std::tie(Open, Close) = *NewDelims;
+ // Extract tag content and full match.
+ size_t ContentStart = Cursor + TagOpen.size();
+ StringRef Content = Template.substr(ContentStart, EndPos - ContentStart);
+ StringRef FullMatch =
+ Template.substr(Cursor, (EndPos + TagClose.size()) - Cursor);
+
+ // Process the tag (inlined logic from processTag).
+ LLVM_DEBUG(dbgs() << "[Tag] " << FullMatch << ", Content: " << Content
+ << ", Kind: " << tagKindToString(Kind) << "\n");
+ if (Kind == Tag::Kind::Triple) {
+ Tokens.emplace_back(FullMatch, Ctx.Saver.save("&" + Content), '&', Ctx);
+ } else { // Normal Tag
+ StringRef Interpolated = Content;
+ if (!Interpolated.trim().starts_with("=")) {
+ char Front = Interpolated.empty() ? ' ' : Interpolated.trim().front();
+ Tokens.emplace_back(FullMatch, Interpolated, Front, Ctx);
+ } else { // Set Delimiter
+ Tokens.emplace_back(FullMatch, Interpolated, '=', Ctx);
+ StringRef DelimSpec = Interpolated.trim();
+ DelimSpec = DelimSpec.drop_front(1);
+ DelimSpec = DelimSpec.take_until([](char C) { return C == '='; });
+ DelimSpec = DelimSpec.trim();
+
+ auto [NewOpen, NewClose] = DelimSpec.split(' ');
+ LLVM_DEBUG(dbgs() << "[Set Delimiter] NewOpen: " << NewOpen
+ << ", NewClose: " << NewClose << "\n");
+ Open = NewOpen;
+ Close = NewClose;
+ }
}
- // Move past the tag.
- Start = T.StartPosition + T.FullMatch.size();
+ // Move past the tag for the next iteration.
+ Cursor += FullMatch.size();
+ TextStart = Cursor;
}
- // Fix up white spaces for:
- // - open sections
- // - inverted sections
- // - close sections
- // - comments
- //
- // This loop attempts to find standalone tokens and tries to trim out
- // the surrounding whitespace.
- // For example:
- // if you have the template string
- // {{#section}} \n Example \n{{/section}}
- // The output should would be
- // For example:
- // \n Example \n
+ // Add any remaining text after the last tag.
+ if (TextStart < Template.size()) {
+ Tokens.emplace_back(Template.substr(TextStart));
+ }
+
+ // Fix up white spaces for standalone tags.
size_t LastIdx = Tokens.size() - 1;
for (size_t Idx = 0, End = Tokens.size(); Idx < End; ++Idx) {
Token &CurrentToken = Tokens[Idx];
Token::Type CurrentType = CurrentToken.getType();
- // Check if token type requires cleanup.
- bool RequiresCleanUp = requiresCleanUp(CurrentType);
-
- if (!RequiresCleanUp)
+ if (!requiresCleanUp(CurrentType))
continue;
- // We adjust the token body if there's no text behind or ahead.
- // A token is considered to have no text ahead if the right of the previous
- // token is a newline followed by spaces.
- // A token is considered to have no text behind if the left of the next
- // token is spaces followed by a newline.
- // eg.
- // "Line 1\n {{#section}} \n Line 2 \n {{/section}} \n Line 3"
bool HasTextBehind = hasTextBehind(Idx, Tokens);
bool HasTextAhead = hasTextAhead(Idx, Tokens);
|
b65b639 to
749306f
Compare
3ac408e to
b929e27
Compare
1389ea3 to
8ee94b4
Compare
1a9b251 to
dba238e
Compare
5b2c23c to
8b1efc3
Compare
dba238e to
038f7b2
Compare
8b1efc3 to
964edee
Compare
5c872a1 to
cbc1f38
Compare
a920b52 to
b9da266
Compare
cbc1f38 to
3d520d8
Compare
b9da266 to
11fe600
Compare
3d520d8 to
3e3e4ea
Compare
11fe600 to
59b5eda
Compare
709be29 to
537d1d7
Compare
59b5eda to
4b5d45b
Compare
537d1d7 to
fbaaa82
Compare
6020deb to
8474924
Compare
fbaaa82 to
a69791c
Compare
8474924 to
3289df4
Compare
a69791c to
4aa6943
Compare
4aa6943 to
3338a34
Compare
evelez7
approved these changes
Oct 20, 2025
evelez7
left a comment
Member
There was a problem hiding this comment.
LGTM with a couple styling nits
The old implementation used many string searches over the same portions of the strings. This version sacrifices some API niceness for perf wins. Metric | Baseline | Single-Pass | Change -------------- | -------- | ----------- | ------- Time (ms) | 36.09 | 35.78 | -0.86% Cycles | 35.3M | 35.0M | -0.79% Instructions | 86.7M | 85.8M | -1.03% Branch Misses | 116K | 114K | -1.91% Cache Misses | 244K | 232K | -4.98%
3338a34 to
ede7551
Compare
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.This suggestion is invalid because no changes were made to the code.Suggestions cannot be applied while the pull request is closed.Suggestions cannot be applied while viewing a subset of changes.Only one suggestion per line can be applied in a batch.Add this suggestion to a batch that can be applied as a single commit.Applying suggestions on deleted lines is not supported.You must change the existing code in this line in order to create a valid suggestion.Outdated suggestions cannot be applied.This suggestion has been applied or marked resolved.Suggestions cannot be applied from pending reviews.Suggestions cannot be applied on multi-line comments.Suggestions cannot be applied while the pull request is queued to merge.Suggestion cannot be applied right now. Please check back later.

The old implementation used many string searches over the same portions
of the strings. This version sacrifices some API niceness for perf wins.