[lld] [ELF] ScriptLexer: generate tokens lazily (PR #100493)
Daniel Thornburgh via llvm-commits
llvm-commits at lists.llvm.org
Tue Jul 30 09:30:39 PDT 2024
================
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
}
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
- StringRef ops = "!~*/+-<>?^:="; // List of operators
-
- // Quoted strings are literal strings, so we don't want to split them.
- if (s.starts_with("\""))
- return {s};
-
- // Split S with operators as separators.
- std::vector<StringRef> ret;
- while (!s.empty()) {
- size_t e = s.find_first_of(ops);
-
- // No need to split if there is no operator.
- if (e == StringRef::npos) {
- ret.push_back(s);
- break;
- }
-
- // Get a token before the operator.
- if (e != 0)
- ret.push_back(s.substr(0, e));
-
- // Get the operator as a token.
- // Keep !=, ==, >=, <=, << and >> operators as single tokens.
- if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
- s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
- s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
- ret.push_back(s.substr(e, 2));
- s = s.substr(e + 2);
- } else {
- ret.push_back(s.substr(e, 1));
- s = s.substr(e + 1);
- }
- }
- return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
- if (!inExpr || errorCount() || atEOF())
- return;
-
- std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
- if (v.size() == 1)
- return;
- tokens.erase(tokens.begin() + pos);
- tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
StringRef ScriptLexer::next() {
- maybeSplitExpr();
-
- if (errorCount())
- return "";
- if (atEOF()) {
- setError("unexpected EOF");
- return "";
- }
- return tokens[pos++];
+ prevTok = peek();
+ return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}
StringRef ScriptLexer::peek() {
----------------
mysterymath wrote:
> The lexer is indeed different from traditional implementations. The reason is to support states (currently just `inExpr` or not). `peek` caches the token at `curBuf`. `curBuf` might not be used if the state changes. Therefore, `peek` has to check `curBufState` and be prepared to drop the cache if the parser has changed the state.
>
> If `next` is made to read the next token, that token might become stale as well when the state changes.
It seems like this could be addressed by having state changes do a bit more work: discarding the cached token and re-lexing it under the new state.
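Something like this minimal sketch, say (hypothetical names, not lld's actual implementation; `TinyLexer`, `lexOne`, and `setExprState` are made up for illustration). The state setter drops the cached token, and the next `peek` re-lexes under the new rules, so a token lexed under a stale state can never escape:

```cpp
#include <cctype>
#include <optional>
#include <string_view>
#include <utility>

class TinyLexer {
  std::string_view buf;  // unconsumed input; advanced only by next()
  bool inExpr = false;   // tokenization state
  // Token cached by peek(), paired with the input remaining after it.
  // Only valid for the state it was lexed under.
  std::optional<std::pair<std::string_view, std::string_view>> cached;

  // Hypothetical rules: skip spaces; in expression state, a single
  // operator character forms its own token.
  std::pair<std::string_view, std::string_view> lexOne() const {
    std::string_view s = buf, ops = "!~*/+-<>?^:=";
    while (!s.empty() && std::isspace((unsigned char)s.front()))
      s.remove_prefix(1);
    size_t n = 0;
    if (inExpr && !s.empty() && ops.find(s.front()) != ops.npos)
      n = 1;  // operator chars are standalone tokens in expr state
    else
      while (n < s.size() && !std::isspace((unsigned char)s[n]) &&
             (!inExpr || ops.find(s[n]) == ops.npos))
        ++n;
    return {s.substr(0, n), s.substr(n)};
  }

public:
  explicit TinyLexer(std::string_view input) : buf(input) {}

  void setExprState(bool b) {
    if (b != inExpr) {
      inExpr = b;
      cached.reset();  // drop the stale token; peek() re-lexes lazily
    }
  }
  std::string_view peek() {
    if (!cached)
      cached = lexOne();
    return cached->first;  // empty token signals end of input
  }
  std::string_view next() {
    std::string_view tok = peek();
    buf = cached->second;  // commit: consume the token's input
    cached.reset();
    return tok;
  }
};
```

With this scheme "foo*3" lexes as one token normally and as "foo", "*", "3" in the expression state, and a `setExprState` call between `peek` and `next` just costs one re-lex rather than a cache-validity check on every `peek`.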
> I think `atEOF` is still like traditional parsers, which require a `peek` to know whether EOF has been encountered.
This is surprising to me; I had thought that most either maintained a valid atEOF as an invariant, or even called it out as a designated token type. The latter is what I've more typically seen.
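For reference, a minimal sketch of the designated-EOF-token convention (again hypothetical, not lld's code): the lexer always returns a well-formed token, and end of input is just another token kind, so the parser never needs a separate `peek` to detect EOF:

```cpp
#include <cstddef>
#include <string_view>

enum class TokKind { Word, Eof };

struct Token {
  TokKind kind;
  std::string_view text;
};

class EofTokenLexer {
  std::string_view buf;

public:
  explicit EofTokenLexer(std::string_view input) : buf(input) {}

  Token next() {
    while (!buf.empty() && buf.front() == ' ')
      buf.remove_prefix(1);
    if (buf.empty())
      return {TokKind::Eof, {}};  // end of input is an ordinary token
    size_t n = buf.find(' ');
    if (n == buf.npos)
      n = buf.size();
    Token tok{TokKind::Word, buf.substr(0, n)};
    buf.remove_prefix(n);
    return tok;
  }
};

// Typical parser loop:
//   for (Token t = lex.next(); t.kind != TokKind::Eof; t = lex.next())
//     handle(t);
```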
https://github.com/llvm/llvm-project/pull/100493