[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Mon Jan 27 04:37:28 PST 2025

================
@@ -0,0 +1,189 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the lexer for the Data Inspection Language (DIL), and its
+// helper functions, which will eventually underlie the 'frame variable'
+// command. The language that this lexer tokenizes is described in
+// lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::invalid:
+    return "invalid";
+  case Kind::kw_namespace:
+    return "namespace";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::none:
+    return "none";
+  case Kind::r_paren:
+    return "r_paren";
+  case Kind::unknown:
+    return "unknown";
+  }
+}
+
+static bool IsLetter(char c) {
+  return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
+}
+
+static bool IsDigit(char c) { return '0' <= c && c <= '9'; }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or underscores.
+llvm::iterator_range<llvm::StringRef::iterator> DILLexer::IsWord() {
+  llvm::StringRef::iterator start = m_cur_pos;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (m_cur_pos == m_expr.end() || IsDigit(*m_cur_pos))
+    return llvm::make_range(m_cur_pos, m_cur_pos);
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*m_cur_pos == '$') {
+    dollar_start = true;
+    ++m_cur_pos;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; m_cur_pos != m_expr.end(); ++m_cur_pos) {
+    char c = *m_cur_pos;
+    if (!IsLetter(c) && !IsDigit(c) && c != '_')
+      break;
+  }
+
+  // If first char is '$', make sure there's at least one more char, or it's
+  // invalid.
+  if (dollar_start && (m_cur_pos - start <= 1)) {
+    m_cur_pos = start;
+    return llvm::make_range(start, start); // Empty range
+  }
+
+  return llvm::make_range(start, m_cur_pos);
+}
+
+void DILLexer::UpdateLexedTokens(Token &result, Token::Kind tok_kind,
+                                 std::string tok_str, uint32_t tok_pos) {
+  Token new_token(tok_kind, tok_str, tok_pos);
+  result = new_token;
+  m_lexed_tokens.push_back(std::move(new_token));
+}
+
+llvm::Expected<bool> DILLexer::LexAll() {
+  bool done = false;
+  while (!done) {
+    auto tok_or_err = Lex();
+    if (!tok_or_err)
+      return tok_or_err.takeError();
+    Token token = *tok_or_err;
+    if (token.GetKind() == Token::eof) {
+      done = true;
+    }
+  }
+  return true;
+}
+
+llvm::Expected<Token> DILLexer::Lex() {
+  Token result;
+
+  // Skip over whitespace (spaces).
+  while (m_cur_pos != m_expr.end() && *m_cur_pos == ' ')
+    m_cur_pos++;
+
+  // Check to see if we've reached the end of our input string.
+  if (m_cur_pos == m_expr.end()) {
+    UpdateLexedTokens(result, Token::eof, "", (uint32_t)m_expr.size());
+    return result;
+  }
+
+  uint32_t position = m_cur_pos - m_expr.begin();
+  llvm::StringRef::iterator start = m_cur_pos;
+  llvm::iterator_range<llvm::StringRef::iterator> word_range = IsWord();
+  if (!word_range.empty()) {
+    uint32_t length = word_range.end() - word_range.begin();
+    llvm::StringRef word(m_expr.substr(position, length));
+    // We will be adding more keywords here in the future...
+    Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
+                           .Case("namespace", Token::kw_namespace)
+                           .Default(Token::identifier);
+    UpdateLexedTokens(result, kind, word.str(), position);
+    return result;
+  }
+
+  m_cur_pos = start;
+  llvm::StringRef remainder(m_expr.substr(position, m_expr.end() - m_cur_pos));
+  std::vector<std::pair<Token::Kind, const char *>> operators = {
+      {Token::l_paren, "("},
+      {Token::r_paren, ")"},
+      {Token::coloncolon, "::"},
+  };
+  for (auto [kind, str] : operators) {
+    if (remainder.consume_front(str)) {
+      m_cur_pos += strlen(str);
+      UpdateLexedTokens(result, kind, str, position);
+      return result;
+    }
+  }
+
+  // Unrecognized character(s) in string; unable to lex it.
+  Status error("Unable to lex input string");
+  return error.ToError();
+}
----------------
labath wrote:

Sorry for rewriting this for you, but I figured it's easier than explaining everything in the abstract:

The main things I wanted to achieve by this are:
- no half-initialized state (object constructed, but LexAll not called). The object is always constructed fully parsed. It's basically what's described [here](https://llvm.org/docs/ProgrammersManual.html#fallible-constructors), but even better because there isn't even a privately-visible half-initialized state. (Since the only state of the lexer is basically "the remainder of the string", I figured it's easier to pass it as arguments and construct the lexer only when it's done. This also lets us get rid of the `m_cur_pos` member, which is only used in the initialization stage.)
- I doubled down on the StringRef representation. I see you've partially used it, but that still meant that there were some awkward conversions between position-in-the-string and StringRef representations. Now they're gone. I also realized that `iterator_range<StringRef::iterator>` is just an (unnecessarily) fancy name for `StringRef`, so I just use that throughout. 
- no more `UpdateLexedTokens`. Just using `Token` as a value type. The overall programming style is also more functional - fewer side effects, more return values.

The thing I did not do (but I still think it would be better) is to replace the `std::vector<std::pair<>>` keyword representation with the "constexpr array of pairs" I had in my original suggestion. I think that's better because the vector thing means you'll be constructing a new vector object every time you call this function. That's going to impact the performance more (although it will still probably be unnoticeable) than any StringSwitch usage, as it causes a memory allocation. If you think the use of a C array is obsolete, you can also use a `constexpr std::initializer_list<std::pair<>>`, but I find that just adds an unnecessary level of boilerplate.


```suggestion
// Lexes all of `expr` up front and returns a fully-initialized DILLexer, or
// the first lexing error encountered. The token list always ends with an eof
// token, which is what terminates the loop.
// NOTE(review): Lex must be declared before this point for this to compile.
llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
  std::vector<Token> tokens;
  // The not-yet-lexed tail of `expr`; Lex advances it past each token.
  llvm::StringRef remainder = expr;
  do {
    if (llvm::Expected<Token> t = Lex(expr, remainder))
      tokens.push_back(std::move(*t));
    else
      return t.takeError();
  } while (tokens.back().GetKind() != Token::eof);
  return DILLexer(std::move(tokens)); // calling a private constructor
}

// Lexes a single token from the front of `remainder`.
//
// `expr` is the complete expression being lexed; it is used only to compute
// the token's position (the offset of its first character within `expr`).
// `remainder` is the not-yet-lexed tail of `expr`; it is advanced past any
// leading whitespace and past the token that is returned.
//
// Returns an eof token once the input is exhausted, or an error if the next
// characters form neither a word nor a known operator.
static llvm::Expected<Token> Lex(llvm::StringRef expr, llvm::StringRef &remainder) {
  // Skip over whitespace.
  remainder = remainder.ltrim();

  size_t position = remainder.data() - expr.data();
  // Check to see if we've reached the end of our input string.
  if (remainder.empty())
    return Token(Token::eof, "", position);

  llvm::StringRef word = IsWord(remainder); // automatically updates `remainder`, you may be able to use things like `StringRef::drop_while` in the implementation
  if (!word.empty()) {
    // We will be adding more keywords here in the future...
    Token::Kind kind = llvm::StringSwitch<Token::Kind>(word)
                           .Case("namespace", Token::kw_namespace)
                           .Default(Token::identifier);
    return Token(kind, word.str(), position);
  }

  // NOTE: this table is rebuilt (and heap-allocated) on every call; a
  // constexpr array of pairs would avoid that.
  std::vector<std::pair<Token::Kind, const char *>> operators = {
      {Token::l_paren, "("},
      {Token::r_paren, ")"},
      {Token::coloncolon, "::"},
  };
  for (auto [kind, str] : operators) {
    // consume_front advances `remainder` past the operator on a match.
    if (remainder.consume_front(str))
      return Token(kind, str, position);
  }

  // Unrecognized character(s) in string; unable to lex it.
  return llvm::createStringError("Unable to lex input string");
}
```


https://github.com/llvm/llvm-project/pull/123521