[Lldb-commits] [lldb] [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (PR #123521)

Mon Jan 20 02:14:38 PST 2025

================
@@ -0,0 +1,205 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the recursive descent parser for the Data Inspection
+// Language (DIL), and its helper functions, which will eventually underlie the
+// 'frame variable' command. The language that this parser recognizes is
+// described in lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringMap.h"
+
+namespace lldb_private {
+
+namespace dil {
+
+// For fast keyword lookup. More keywords will be added later.
+const llvm::StringMap<dil::TokenKind> Keywords = {
+    {"namespace", dil::TokenKind::kw_namespace},
+};
+
+const std::string DILToken::getTokenName(dil::TokenKind kind) {
+  switch (kind) {
+  case dil::TokenKind::coloncolon:
+    return "coloncolon";
+  case dil::TokenKind::eof:
+    return "eof";
+  case dil::TokenKind::identifier:
+    return "identifier";
+  case dil::TokenKind::kw_namespace:
+    return "namespace";
+  case dil::TokenKind::l_paren:
+    return "l_paren";
+  case dil::TokenKind::r_paren:
+    return "r_paren";
+  case dil::TokenKind::unknown:
+    return "unknown";
+  default:
+    return "token_name";
+  }
+}
+
+static bool Is_Letter(char c) {
+  if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z'))
+    return true;
+  return false;
+}
+
+static bool Is_Digit(char c) { return ('0' <= c && c <= '9'); }
+
+// A word starts with a letter, underscore, or dollar sign, followed by
+// letters ('a'..'z','A'..'Z'), digits ('0'..'9'), and/or  underscores.
+bool DILLexer::Is_Word(std::string::iterator start, uint32_t &length) {
+  bool done = false;
+  bool dollar_start = false;
+
+  // Must not start with a digit.
+  if (m_cur_pos == m_expr.end() || Is_Digit(*m_cur_pos))
+    return false;
+
+  // First character *may* be a '$', for a register name or convenience
+  // variable.
+  if (*m_cur_pos == '$') {
+    dollar_start = true;
+    ++m_cur_pos;
+    length++;
+  }
+
+  // Contains only letters, digits or underscores
+  for (; m_cur_pos != m_expr.end() && !done; ++m_cur_pos) {
+    char c = *m_cur_pos;
+    if (!Is_Letter(c) && !Is_Digit(c) && c != '_') {
+      done = true;
+      break;
+    } else
+      length++;
+  }
+
+  if (dollar_start && length > 1) // Must have something besides just '$'
+    return true;
+
+  if (!dollar_start && length > 0)
+    return true;
+
+  // Not a valid word, so re-set the lexing position.
+  m_cur_pos = start;
----------------
labath wrote:

AFAICT, this is the only use of the start argument, which makes for a very weird API. Perhaps the function could make a note of the starting position internally, and then return the range it found to the caller (I'd suggest a return type of `iterator_range<string::iterator>, with the empty range meaning "no word found")

https://github.com/llvm/llvm-project/pull/123521