[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Mon Jan 20 02:14:38 PST 2025

================
@@ -0,0 +1,156 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H_
+#define LLDB_VALUEOBJECT_DILLEXER_H_
+
+#include "llvm/ADT/StringRef.h"
+#include <cstdint>
+#include <limits.h>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private {
+
+namespace dil {
+
+enum class TokenKind {
+  coloncolon,
+  eof,
+  identifier,
+  invalid,
+  kw_namespace,
+  l_paren,
+  none,
+  r_paren,
+  unknown,
+};
+
+/// Class defining the tokens generated by the DIL lexer and used by the
+/// DIL parser.
+class DILToken {
+public:
+  DILToken(dil::TokenKind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(spelling), m_start_pos(start) {}
+
+  DILToken() : m_kind(dil::TokenKind::none), m_spelling(""), m_start_pos(0) {}
+
+  void setKind(dil::TokenKind kind) { m_kind = kind; }
+  dil::TokenKind getKind() const { return m_kind; }
+
+  std::string getSpelling() const { return m_spelling; }
+
+  uint32_t getLength() const { return m_spelling.size(); }
+
+  bool is(dil::TokenKind kind) const { return m_kind == kind; }
+
+  bool isNot(dil::TokenKind kind) const { return m_kind != kind; }
+
+  bool isOneOf(dil::TokenKind kind1, dil::TokenKind kind2) const {
+    return is(kind1) || is(kind2);
+  }
+
+  template <typename... Ts> bool isOneOf(dil::TokenKind kind, Ts... Ks) const {
+    return is(kind) || isOneOf(Ks...);
+  }
+
+  uint32_t getLocation() const { return m_start_pos; }
+
+  void setValues(dil::TokenKind kind, std::string spelling, uint32_t start) {
+    m_kind = kind;
+    m_spelling = spelling;
+    m_start_pos = start;
+  }
+
+  static const std::string getTokenName(dil::TokenKind kind);
+
+private:
+  dil::TokenKind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+class DILLexer {
+public:
+  DILLexer(llvm::StringRef dil_expr) : m_expr(dil_expr.str()) {
+    m_cur_pos = m_expr.begin();
+    // Use UINT_MAX to indicate invalid/uninitialized value.
+    m_tokens_idx = UINT_MAX;
+  }
+
+  bool Lex(DILToken &result, bool look_ahead = false);
+
+  bool Is_Word(std::string::iterator start, uint32_t &length);
+
+  uint32_t GetLocation() { return m_cur_pos - m_expr.begin(); }
+
+  /// Update 'result' with the other paremeter values, create a
+  /// duplicate token, and push the duplicate token onto the vector of
+  /// lexed tokens.
+  void UpdateLexedTokens(DILToken &result, dil::TokenKind tok_kind,
+                         std::string tok_str, uint32_t tok_pos);
+
+  /// Return the lexed token N+1 positions ahead of the 'current' token
+  /// being handled by the DIL parser.
+  const DILToken &LookAhead(uint32_t N);
+
+  const DILToken &AcceptLookAhead(uint32_t N);
----------------
labath wrote:

That would definitely be nice, and if its true, I'd consider taking this even further: given that we're going to be processing about a line of text at most, we might be able to just eagerly lex the whole string and avoid the whole on-demand parsing business. If so, maybe we don't even need the lexer *class* and the lexing could consist of a single function like `std::vector<Token> Lex(string)` ?

https://github.com/llvm/llvm-project/pull/123521