[lld] [ELF] ScriptLexer: generate tokens lazily (PR #100493)
Fangrui Song via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 30 18:47:15 PDT 2024
================
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
}
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
- StringRef ops = "!~*/+-<>?^:="; // List of operators
-
- // Quoted strings are literal strings, so we don't want to split it.
- if (s.starts_with("\""))
- return {s};
-
- // Split S with operators as separators.
- std::vector<StringRef> ret;
- while (!s.empty()) {
- size_t e = s.find_first_of(ops);
-
- // No need to split if there is no operator.
- if (e == StringRef::npos) {
- ret.push_back(s);
- break;
- }
-
- // Get a token before the operator.
- if (e != 0)
- ret.push_back(s.substr(0, e));
-
- // Get the operator as a token.
- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
- if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
- s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
- s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
- ret.push_back(s.substr(e, 2));
- s = s.substr(e + 2);
- } else {
- ret.push_back(s.substr(e, 1));
- s = s.substr(e + 1);
- }
- }
- return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
- if (!inExpr || errorCount() || atEOF())
- return;
-
- std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
- if (v.size() == 1)
- return;
- tokens.erase(tokens.begin() + pos);
- tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
StringRef ScriptLexer::next() {
- maybeSplitExpr();
-
- if (errorCount())
- return "";
- if (atEOF()) {
- setError("unexpected EOF");
- return "";
- }
- return tokens[pos++];
+ prevTok = peek();
+ return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}
StringRef ScriptLexer::peek() {
----------------
MaskRay wrote:
> It seems like this could be addressed by having state changes do a little bit more work: discarding the current token and re-lexing a token in the new state.
This is exactly what the current `peek()` does.
Are you suggesting that `next()` should consume the current token and peek the next one?
Since we don't always call `peek()` before `next()`, this would require `next()` to call `peek()` twice, which would be awkward.
---
Currently, when `peek()` or `next()` returns an empty string, it indicates an EOF token.
https://github.com/llvm/llvm-project/pull/100493
More information about the llvm-commits mailing list