[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Andy Hippo via lldb-commits lldb-commits at lists.llvm.org
Sun Jan 19 13:01:56 PST 2025


================
@@ -0,0 +1,156 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <limits.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private {
+
+namespace dil {
+
+enum class TokenKind {
+  coloncolon,
+  eof,
+  identifier,
+  invalid,
+  kw_namespace,
+  l_paren,
+  none,
+  r_paren,
+  unknown,
+};
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {
+public:
+  DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+
+  DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
+
+  void setKind(dil::TokenKind kind) { m_kind = kind; }
+  dil::TokenKind getKind() const { return m_kind; }
+
+  std::string getSpelling() const { return m_spelling; }
+
+  uint32_t getLength() const { return m_spelling.size(); }
+
+  bool is(dil::TokenKind kind) const { return m_kind == kind; }
+
+  bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
+
+  bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
+    return is(kind1) || is(kind2);
+  }
+
+  template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
+    return is(kind) || isOneOf(Ks...);
+  }
+
+  uint32_t getLocation() const { return m_start_pos; }
+
+  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+    m_kind = kind;
+    m_spelling = spelling;
+    m_start_pos = start;
+  }
+
+  static const std::string getTokenName(dil::TokenKind kind);
+
+private:
+  dil::TokenKind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
+    m_cur_pos = m_expr.begin();
+    // Use UINT_MAX to indicate invalid/uninitialized value.
+    m_tokens_idx = UINT_MAX;
+  }
+
+  bool Lex(DILToken &result, bool look_ahead = false);
+
+  bool Is_Word(std::string::iterator start, uint32_t &length);
+
+  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+
+  /// Update 'result' with the other parameter values, create a
+  /// duplicate token, and push the duplicate token onto the vector of
+  /// lexed tokens.
+  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                         std::string tok_str, uint32_t tok_pos);
+
+  /// Return the lexed token N+1 positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);
----------------
werat wrote:

I think this API could be simpler. The lexer doesn't actually need to re-lex; the results will always be the same. We only need to roll back occasionally, but we'll always be processing the same sequence of tokens the second time.

So the lexer can always add the tokens to the `m_lexed_tokens` vector, and we only need `GetCurrentTokenIdx()` and `ResetTokenIdx()` to do the rollback.
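Roughly something like the sketch below (only `GetCurrentTokenIdx()`/`ResetTokenIdx()` are the names proposed above; `GetCurrentToken`, `Advance`, and `LexOneToken` are just illustrative placeholders, not part of this patch):

```cpp
#include <cstdint>
#include <vector>

class DILLexer {
public:
  // Return the token at the current index, lexing more input only when the
  // cache does not already contain it. Tokens are never lexed twice.
  const DILToken &GetCurrentToken() {
    while (m_tokens_idx >= m_lexed_tokens.size())
      m_lexed_tokens.push_back(LexOneToken());
    return m_lexed_tokens[m_tokens_idx];
  }

  void Advance() { ++m_tokens_idx; }

  // Save/restore the position for backtracking; no re-lexing is needed,
  // the parser just replays the cached tokens from the saved index.
  uint32_t GetCurrentTokenIdx() const { return m_tokens_idx; }
  void ResetTokenIdx(uint32_t idx) { m_tokens_idx = idx; }

private:
  DILToken LexOneToken(); // the actual character-level lexing

  std::vector<DILToken> m_lexed_tokens;
  uint32_t m_tokens_idx = 0;
};
```

With that shape, `LookAhead(N)` also becomes a simple read of `m_lexed_tokens[m_tokens_idx + N]` (after filling the cache), and the parser never has to care about the lexer's internal character position.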

https://github.com/llvm/llvm-project/pull/123521


More information about the lldb-commits mailing list