[Lldb-commits] [lldb] d9a7498 - [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (#123521)
via lldb-commits
lldb-commits at lists.llvm.org
Wed Feb 5 10:47:15 PST 2025
Author: cmtice
Date: 2025-02-05T10:47:11-08:00
New Revision: d9a7498aa24a35bdd95fd20a5c63e9495b6669f6
URL: https://github.com/llvm/llvm-project/commit/d9a7498aa24a35bdd95fd20a5c63e9495b6669f6
DIFF: https://github.com/llvm/llvm-project/commit/d9a7498aa24a35bdd95fd20a5c63e9495b6669f6.diff
LOG: [LLDB] Add Lexer (with tests) for DIL (Data Inspection Language). (#123521)
This adds the basic lexer, with unittests, for the Data Inspection
Language (DIL) -- see
https://discourse.llvm.org/t/rfc-data-inspection-language/69893
This version of the lexer only handles local variables and namespaces,
and is designed to work with
https://github.com/llvm/llvm-project/pull/120971.
Added:
lldb/include/lldb/ValueObject/DILLexer.h
lldb/source/ValueObject/DILLexer.cpp
lldb/unittests/ValueObject/DILLexerTests.cpp
Modified:
lldb/source/ValueObject/CMakeLists.txt
lldb/unittests/ValueObject/CMakeLists.txt
Removed:
################################################################################
diff --git a/lldb/include/lldb/ValueObject/DILLexer.h b/lldb/include/lldb/ValueObject/DILLexer.h
new file mode 100644
index 00000000000000..e1182da5b20ab2
--- /dev/null
+++ b/lldb/include/lldb/ValueObject/DILLexer.h
@@ -0,0 +1,123 @@
+//===-- DILLexer.h ----------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLDB_VALUEOBJECT_DILLEXER_H
+#define LLDB_VALUEOBJECT_DILLEXER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace lldb_private::dil {
+
+/// A single lexical token of a DIL expression: its kind, its spelling and the
+/// offset at which it starts. Tokens are generated by the DIL lexer and used
+/// by the DIL parser.
+class Token {
+public:
+  enum Kind {
+    coloncolon,
+    eof,
+    identifier,
+    l_paren,
+    r_paren,
+  };
+
+  /// \param kind     the token's kind.
+  /// \param spelling the text of the token; moved into the token.
+  /// \param start    offset of the token's first character.
+  Token(Kind kind, std::string spelling, uint32_t start)
+      : m_kind(kind), m_spelling(std::move(spelling)), m_start_pos(start) {}
+
+  /// The kind this token was constructed with.
+  Kind GetKind() const { return m_kind; }
+
+  /// A copy of the text this token was constructed with.
+  std::string GetSpelling() const { return m_spelling; }
+
+  /// The start offset this token was constructed with.
+  uint32_t GetLocation() const { return m_start_pos; }
+
+  /// True if this token is of the given kind.
+  bool Is(Kind kind) const { return kind == m_kind; }
+
+  /// True if this token is of any kind other than the given one.
+  bool IsNot(Kind kind) const { return !Is(kind); }
+
+  /// True if this token's kind equals either of the two given kinds.
+  bool IsOneOf(Kind kind1, Kind kind2) const {
+    return m_kind == kind1 || m_kind == kind2;
+  }
+
+  /// True if this token's kind equals any of the given kinds. Peels off the
+  /// first kind and recurses on the rest until the two-kind overload applies.
+  template <typename... Ts> bool IsOneOf(Kind kind, Ts... Ks) const {
+    if (Is(kind))
+      return true;
+    return IsOneOf(Ks...);
+  }
+
+  /// Returns a printable name for \p kind.
+  static llvm::StringRef GetTokenName(Kind kind);
+
+private:
+  Kind m_kind;
+  std::string m_spelling;
+  uint32_t m_start_pos; // within entire expression string
+};
+
+/// Class for doing the simple lexing required by DIL.
+///
+/// Create() lexes the whole expression up front; the resulting object is a
+/// cursor over that token list. Create() keeps lexing until it has appended an
+/// 'eof' token, so the list is never empty and its last element is 'eof'.
+class DILLexer {
+public:
+  /// Lexes all the tokens in expr and calls the private constructor
+  /// with the lexed tokens.
+  static llvm::Expected<DILLexer> Create(llvm::StringRef expr);
+
+  /// Return the current token to be handled by the DIL parser.
+  const Token &GetCurrentToken() const {
+    return m_lexed_tokens[m_tokens_idx];
+  }
+
+  /// Advance the current token position by N. If fewer than N tokens follow
+  /// the current one, the position is clamped to the last lexed token.
+  void Advance(uint32_t N = 1) {
+    // Compare N with the distance to the last token instead of forming
+    // m_tokens_idx + N, which wraps around in 32 bits when N is very large
+    // and would then move the cursor backwards instead of clamping it.
+    const uint32_t last_idx = LastTokenIdx();
+    const uint32_t remaining = last_idx - m_tokens_idx;
+    m_tokens_idx = N < remaining ? m_tokens_idx + N : last_idx;
+  }
+
+  /// Return the lexed token N positions ahead of the 'current' token
+  /// being handled by the DIL parser. Looking past the end of the token list
+  /// returns the last token.
+  const Token &LookAhead(uint32_t N) const {
+    // Same overflow-free comparison as in Advance().
+    if (N <= LastTokenIdx() - m_tokens_idx)
+      return m_lexed_tokens[m_tokens_idx + N];
+
+    // Last token should be an 'eof' token.
+    return m_lexed_tokens.back();
+  }
+
+  /// Return the index for the 'current' token being handled by the DIL parser.
+  uint32_t GetCurrentTokenIdx() const { return m_tokens_idx; }
+
+  /// Set the index for the 'current' token (to be handled by the parser)
+  /// to a particular position. Used for either committing 'look ahead' parsing
+  /// or rolling back tentative parsing. \p new_value must be a valid index
+  /// into the lexed tokens.
+  void ResetTokenIdx(uint32_t new_value) {
+    assert(new_value < m_lexed_tokens.size());
+    m_tokens_idx = new_value;
+  }
+
+  /// Number of lexed tokens, including the trailing 'eof' token.
+  uint32_t NumLexedTokens() const {
+    return static_cast<uint32_t>(m_lexed_tokens.size());
+  }
+
+private:
+  DILLexer(llvm::StringRef dil_expr, std::vector<Token> lexed_tokens)
+      : m_expr(dil_expr), m_lexed_tokens(std::move(lexed_tokens)),
+        m_tokens_idx(0) {
+    // Advance() and LookAhead() rely on there being a last token to clamp to.
+    assert(!m_lexed_tokens.empty() && "token list must end with an eof token");
+  }
+
+  /// Lexes a single token from the front of \p remainder; see DILLexer.cpp.
+  static llvm::Expected<Token> Lex(llvm::StringRef expr,
+                                   llvm::StringRef &remainder);
+
+  /// Index of the last lexed token.
+  uint32_t LastTokenIdx() const {
+    return static_cast<uint32_t>(m_lexed_tokens.size() - 1);
+  }
+
+  // The input string we are lexing & parsing.
+  llvm::StringRef m_expr;
+
+  // Holds all of the tokens lexed so far.
+  std::vector<Token> m_lexed_tokens;
+
+  // Index into m_lexed_tokens; indicates which token the DIL parser is
+  // currently trying to parse/handle.
+  uint32_t m_tokens_idx;
+};
+
+} // namespace lldb_private::dil
+
+#endif // LLDB_VALUEOBJECT_DILLEXER_H
diff --git a/lldb/source/ValueObject/CMakeLists.txt b/lldb/source/ValueObject/CMakeLists.txt
index 70cb3d6d53f071..30c34472289e7b 100644
--- a/lldb/source/ValueObject/CMakeLists.txt
+++ b/lldb/source/ValueObject/CMakeLists.txt
@@ -1,4 +1,5 @@
add_lldb_library(lldbValueObject
+ DILLexer.cpp
ValueObject.cpp
ValueObjectCast.cpp
ValueObjectChild.cpp
diff --git a/lldb/source/ValueObject/DILLexer.cpp b/lldb/source/ValueObject/DILLexer.cpp
new file mode 100644
index 00000000000000..c7acfec347af48
--- /dev/null
+++ b/lldb/source/ValueObject/DILLexer.cpp
@@ -0,0 +1,97 @@
+//===-- DILLexer.cpp ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This implements the lexer for the Data Inspection Language (DIL), and its
+// helper functions, which will eventually underlie the 'frame variable'
+// command. The language that this lexer tokenizes is described in
+// lldb/docs/dil-expr-lang.ebnf
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "lldb/Utility/Status.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <optional>
+#include <utility>
+
+namespace lldb_private::dil {
+
+/// Returns the name of \p kind as text; the string is the same as the
+/// enumerator's identifier (e.g. "coloncolon" for Kind::coloncolon).
+llvm::StringRef Token::GetTokenName(Kind kind) {
+  switch (kind) {
+  case Kind::coloncolon:
+    return "coloncolon";
+  case Kind::eof:
+    return "eof";
+  case Kind::identifier:
+    return "identifier";
+  case Kind::l_paren:
+    return "l_paren";
+  case Kind::r_paren:
+    return "r_paren";
+  }
+  // Every enumerator returns above. Without this, a value outside the enum
+  // would flow off the end of a non-void function, and compilers that do not
+  // treat the switch as exhaustive warn about a missing return.
+  llvm_unreachable("Unknown token name");
+}
+
+// True only for the letters 'a'..'z' and 'A'..'Z'.
+static bool IsLetter(char c) {
+  const bool is_lower = c >= 'a' && c <= 'z';
+  const bool is_upper = c >= 'A' && c <= 'Z';
+  return is_lower || is_upper;
+}
+
+// True only for the decimal digits '0'..'9'.
+static bool IsDigit(char c) {
+  return c >= '0' && c <= '9';
+}
+
+// Tries to split a word off the front of `remainder`. A word is the longest
+// leading run of letters ('a'..'z','A'..'Z'), digits ('0'..'9'), underscores
+// and dollar signs, provided that run is non-empty and does not begin with a
+// digit. On success the word is returned and removed from `remainder`;
+// otherwise std::nullopt is returned and `remainder` is left unchanged.
+// `expr` is not consulted.
+static std::optional<llvm::StringRef> IsWord(llvm::StringRef expr,
+                                             llvm::StringRef &remainder) {
+  auto is_word_char = [](char c) {
+    return c == '_' || c == '$' || IsLetter(c) || IsDigit(c);
+  };
+  const llvm::StringRef word = remainder.take_while(is_word_char);
+  if (!word.empty() && !IsDigit(word[0])) {
+    remainder = remainder.drop_front(word.size());
+    return word;
+  }
+  return std::nullopt;
+}
+
+/// Lexes \p expr token by token until an eof token has been produced, then
+/// builds a DILLexer from the collected tokens. The first lexing failure is
+/// returned as the error.
+llvm::Expected<DILLexer> DILLexer::Create(llvm::StringRef expr) {
+  std::vector<Token> tokens;
+  llvm::StringRef remainder = expr;
+  for (;;) {
+    llvm::Expected<Token> tok = Lex(expr, remainder);
+    if (!tok)
+      return tok.takeError();
+    // Note the kind before the token is moved into the vector.
+    const bool reached_eof = tok->GetKind() == Token::eof;
+    tokens.push_back(std::move(*tok));
+    if (reached_eof)
+      break;
+  }
+  return DILLexer(expr, std::move(tokens));
+}
+
+/// Lexes one token from the front of \p remainder.
+///
+/// Leading whitespace is trimmed first. If nothing is left, an eof token
+/// located at expr.size() is returned. Otherwise the token is either a word
+/// (see IsWord) or one of the operators "(", ")" and "::"; its location is
+/// the offset of its first character from the start of \p expr, and the
+/// consumed text is removed from \p remainder. Any other input is an error.
+llvm::Expected<Token> DILLexer::Lex(llvm::StringRef expr,
+                                    llvm::StringRef &remainder) {
+  remainder = remainder.ltrim();
+
+  // Nothing but (possibly) whitespace was left: end of input.
+  if (remainder.empty())
+    return Token(Token::eof, "", static_cast<uint32_t>(expr.size()));
+
+  // Record where the token starts before anything below consumes it.
+  const uint32_t position = remainder.begin() - expr.begin();
+
+  if (std::optional<llvm::StringRef> word = IsWord(expr, remainder))
+    return Token(Token::identifier, word->str(), position);
+
+  if (remainder.consume_front("("))
+    return Token(Token::l_paren, "(", position);
+  if (remainder.consume_front(")"))
+    return Token(Token::r_paren, ")", position);
+  if (remainder.consume_front("::"))
+    return Token(Token::coloncolon, "::", position);
+
+  // Unrecognized character(s) in string; unable to lex it.
+  return llvm::createStringError("Unable to lex input string");
+}
+
+} // namespace lldb_private::dil
diff --git a/lldb/unittests/ValueObject/CMakeLists.txt b/lldb/unittests/ValueObject/CMakeLists.txt
index 8fcc8d62a79979..14808aa2f213a5 100644
--- a/lldb/unittests/ValueObject/CMakeLists.txt
+++ b/lldb/unittests/ValueObject/CMakeLists.txt
@@ -1,10 +1,12 @@
add_lldb_unittest(LLDBValueObjectTests
DumpValueObjectOptionsTests.cpp
+ DILLexerTests.cpp
LINK_LIBS
lldbValueObject
lldbPluginPlatformLinux
lldbPluginScriptInterpreterNone
+ LLVMTestingSupport
LINK_COMPONENTS
Support
diff --git a/lldb/unittests/ValueObject/DILLexerTests.cpp b/lldb/unittests/ValueObject/DILLexerTests.cpp
new file mode 100644
index 00000000000000..9e5b8efd7af80a
--- /dev/null
+++ b/lldb/unittests/ValueObject/DILLexerTests.cpp
@@ -0,0 +1,156 @@
+//===-- DILLexerTests.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "lldb/ValueObject/DILLexer.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Testing/Support/Error.h"
+#include "gtest/gtest.h"
+#include <string>
+
+using llvm::StringRef;
+
+using namespace lldb_private::dil;
+
+// Lexes input_expr and returns the (kind, spelling) pair of every token in
+// order, excluding the terminating eof token. A lexing failure is passed
+// through as the error.
+llvm::Expected<std::vector<std::pair<Token::Kind, std::string>>>
+ExtractTokenData(llvm::StringRef input_expr) {
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  if (!maybe_lexer)
+    return maybe_lexer.takeError();
+  DILLexer lexer(*maybe_lexer);
+
+  std::vector<std::pair<Token::Kind, std::string>> data;
+  // Stop as soon as the eof token is reached, so it is never recorded.
+  for (;;) {
+    Token tok = lexer.GetCurrentToken();
+    if (tok.GetKind() == Token::eof)
+      break;
+    data.emplace_back(tok.GetKind(), tok.GetSpelling());
+    lexer.Advance();
+  }
+  return data;
+}
+
+// A lone identifier lexes to one identifier token followed by eof.
+TEST(DILLexerTests, SimpleTest) {
+  StringRef input_expr("simple_var");
+  llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+  ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+  DILLexer lexer(*maybe_lexer);
+
+  // First token: the identifier itself.
+  Token first = lexer.GetCurrentToken();
+  EXPECT_EQ(first.GetKind(), Token::identifier);
+  EXPECT_EQ(first.GetSpelling(), "simple_var");
+
+  // Second token: end of input.
+  lexer.Advance();
+  Token second = lexer.GetCurrentToken();
+  EXPECT_EQ(second.GetKind(), Token::eof);
+}
+
+// Exercises Token::Is and both the two-kind and variadic forms of
+// Token::IsOneOf on a hand-built identifier token.
+TEST(DILLexerTests, TokenKindTest) {
+  Token ident(Token::identifier, "ident", 0);
+
+  // Single-kind queries.
+  EXPECT_TRUE(ident.Is(Token::identifier));
+  EXPECT_FALSE(ident.Is(Token::l_paren));
+
+  // Two-kind overload, with the matching kind listed last.
+  EXPECT_TRUE(ident.IsOneOf(Token::eof, Token::identifier));
+
+  // Variadic overload, with no matching kind in the list.
+  EXPECT_FALSE(ident.IsOneOf(Token::l_paren, Token::r_paren, Token::coloncolon,
+                             Token::eof));
+}
+
+// Checks that LookAhead() inspects upcoming tokens without moving the current
+// token index, that Advance(N) then moves the index by N, and that token
+// locations are offsets into the original expression string.
+TEST(DILLexerTests, LookAheadTest) {
+ StringRef input_expr("(anonymous namespace)::some_var");
+ llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(input_expr);
+ ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+ DILLexer lexer(*maybe_lexer);
+ Token token = lexer.GetCurrentToken();
+
+ // Current token is '('; check the next 4 tokens, to make
+ // sure they are the identifier 'anonymous', the identifier 'namespace'
+ // ')' and '::', in that order.
+ EXPECT_EQ(token.GetKind(), Token::l_paren);
+ EXPECT_EQ(lexer.LookAhead(1).GetKind(), Token::identifier);
+ EXPECT_EQ(lexer.LookAhead(1).GetSpelling(), "anonymous");
+ EXPECT_EQ(lexer.LookAhead(2).GetKind(), Token::identifier);
+ EXPECT_EQ(lexer.LookAhead(2).GetSpelling(), "namespace");
+ EXPECT_EQ(lexer.LookAhead(3).GetKind(), Token::r_paren);
+ EXPECT_EQ(lexer.LookAhead(4).GetKind(), Token::coloncolon);
+
+ // Our current index should still be 0, as we only looked ahead; we are still
+ // officially on the '('.
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 0u);
+
+ // Accept the 'lookahead', so our current token is '::', which has the index
+ // 4 in our vector of tokens (which starts at zero).
+ lexer.Advance(4);
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), Token::coloncolon);
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 4u);
+
+ // Step onto the final identifier and check its spelling, index and location.
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), Token::identifier);
+ EXPECT_EQ(token.GetSpelling(), "some_var");
+ EXPECT_EQ(lexer.GetCurrentTokenIdx(), 5u);
+ // The location is the offset of 'some_var' within the whole expression,
+ // i.e. the length of everything that precedes it.
+ EXPECT_EQ(token.GetLocation(), strlen("(anonymous namespace)::"));
+
+ // One more step reaches the terminating eof token.
+ lexer.Advance();
+ token = lexer.GetCurrentToken();
+ EXPECT_EQ(token.GetKind(), Token::eof);
+}
+
+// Lexes a string that mixes identifiers and operators, with and without
+// spaces between them, and checks the full sequence of (kind, spelling)
+// pairs. ExtractTokenData() drops the trailing eof token, so it is not listed.
+TEST(DILLexerTests, MultiTokenLexTest) {
+ EXPECT_THAT_EXPECTED(
+ ExtractTokenData("This string has (several ) ::identifiers"),
+ llvm::HasValue(testing::ElementsAre(
+ testing::Pair(Token::identifier, "This"),
+ testing::Pair(Token::identifier, "string"),
+ testing::Pair(Token::identifier, "has"),
+ testing::Pair(Token::l_paren, "("),
+ testing::Pair(Token::identifier, "several"),
+ testing::Pair(Token::r_paren, ")"),
+ testing::Pair(Token::coloncolon, "::"),
+ testing::Pair(Token::identifier, "identifiers"))));
+}
+
+// Checks which strings lex as a single identifier token, which strings lex
+// successfully into something other than an identifier, and which strings the
+// lexer rejects outright.
+TEST(DILLexerTests, IdentifiersTest) {
+  // These strings should lex into identifier tokens.
+  std::vector<std::string> valid_identifiers = {
+      "$My_name1", "$pc",  "abcd", "_", "_a",     "_a_",      "$",
+      "a_b",       "this", "self", "a", "MyName", "namespace"};
+
+  // The lexer can lex these strings, but they should not be identifiers.
+  std::vector<std::string> invalid_identifiers = {"", "::", "(", ")"};
+
+  // The lexer is expected to fail attempting to lex these strings (it cannot
+  // create valid tokens out of them).
+  std::vector<std::string> invalid_tok_strings = {"234", "2a", "2", "1MyName"};
+
+  // Verify that all of the valid identifiers come out as identifier tokens.
+  for (auto &str : valid_identifiers) {
+    SCOPED_TRACE(str);
+    EXPECT_THAT_EXPECTED(ExtractTokenData(str),
+                         llvm::HasValue(testing::ElementsAre(
+                             testing::Pair(Token::identifier, str))));
+  }
+
+  // Verify that the lexer fails on invalid token strings.
+  for (auto &str : invalid_tok_strings) {
+    SCOPED_TRACE(str);
+    auto maybe_lexer = DILLexer::Create(str);
+    EXPECT_THAT_EXPECTED(maybe_lexer, llvm::Failed());
+  }
+
+  // Verify that none of the invalid identifiers come out as identifier tokens.
+  for (auto &str : invalid_identifiers) {
+    SCOPED_TRACE(str);
+    llvm::Expected<DILLexer> maybe_lexer = DILLexer::Create(str);
+    // This must be a fatal assertion: maybe_lexer is dereferenced right below,
+    // which is only valid when lexing succeeded.
+    ASSERT_THAT_EXPECTED(maybe_lexer, llvm::Succeeded());
+    DILLexer lexer(*maybe_lexer);
+    Token token = lexer.GetCurrentToken();
+    EXPECT_TRUE(token.IsNot(Token::identifier));
+    EXPECT_TRUE(token.IsOneOf(Token::eof, Token::coloncolon, Token::l_paren,
+                              Token::r_paren));
+  }
+}
More information about the lldb-commits
mailing list