[lld] [ELF] ScriptLexer: generate tokens lazily (PR #100493)
Daniel Thornburgh via llvm-commits
llvm-commits at lists.llvm.org
Mon Jul 29 12:27:55 PDT 2024
================
@@ -185,93 +194,30 @@ StringRef ScriptLexer::skipSpace(StringRef s) {
}
}
-// An erroneous token is handled as if it were the last token before EOF.
-bool ScriptLexer::atEOF() { return errorCount() || tokens.size() == pos; }
-
-// Split a given string as an expression.
-// This function returns "3", "*" and "5" for "3*5" for example.
-static std::vector<StringRef> tokenizeExpr(StringRef s) {
- StringRef ops = "!~*/+-<>?^:="; // List of operators
-
- // Quoted strings are literal strings, so we don't want to split it.
- if (s.starts_with("\""))
- return {s};
-
- // Split S with operators as separators.
- std::vector<StringRef> ret;
- while (!s.empty()) {
- size_t e = s.find_first_of(ops);
-
- // No need to split if there is no operator.
- if (e == StringRef::npos) {
- ret.push_back(s);
- break;
- }
-
- // Get a token before the operator.
- if (e != 0)
- ret.push_back(s.substr(0, e));
-
- // Get the operator as a token.
- // Keep !=, ==, >=, <=, << and >> operators as a single tokens.
- if (s.substr(e).starts_with("!=") || s.substr(e).starts_with("==") ||
- s.substr(e).starts_with(">=") || s.substr(e).starts_with("<=") ||
- s.substr(e).starts_with("<<") || s.substr(e).starts_with(">>")) {
- ret.push_back(s.substr(e, 2));
- s = s.substr(e + 2);
- } else {
- ret.push_back(s.substr(e, 1));
- s = s.substr(e + 1);
- }
- }
- return ret;
-}
-
-// In contexts where expressions are expected, the lexer should apply
-// different tokenization rules than the default one. By default,
-// arithmetic operator characters are regular characters, but in the
-// expression context, they should be independent tokens.
-//
-// For example, "foo*3" should be tokenized to "foo", "*" and "3" only
-// in the expression context.
-//
-// This function may split the current token into multiple tokens.
-void ScriptLexer::maybeSplitExpr() {
- if (!inExpr || errorCount() || atEOF())
- return;
-
- std::vector<StringRef> v = tokenizeExpr(tokens[pos]);
- if (v.size() == 1)
- return;
- tokens.erase(tokens.begin() + pos);
- tokens.insert(tokens.begin() + pos, v.begin(), v.end());
-}
+// Used to determine whether to stop parsing. Treat errors like EOF.
+bool ScriptLexer::atEOF() { return eof || errorCount(); }
StringRef ScriptLexer::next() {
- maybeSplitExpr();
-
- if (errorCount())
- return "";
- if (atEOF()) {
- setError("unexpected EOF");
- return "";
- }
- return tokens[pos++];
+ prevTok = peek();
+ return std::exchange(curTok, StringRef(curBuf.s.data(), 0));
}
StringRef ScriptLexer::peek() {
----------------
mysterymath wrote:
Apologies; didn't get a chance to ask this before it went out.
The lexers I've typically seen preemptively read in the first token: `peek()` is a no-op that returns the previously read token, while `next()` is what reads the next one. Reading tokens lazily instead creates another bit of state in the lexer: whether or not a token has been read in yet. That state is what creates the issue with atEOF: until `peek()` has been called at least once, EOF status isn't yet known.
Could the LLD lexer be written that way, or is there a concern that I'm missing? It seems like it would have been less disruptive to the rest of the parser, and it seems simpler to reason about, since there is less state. (Somewhat subjective, for sure.)
https://github.com/llvm/llvm-project/pull/100493