[Lldb-commits] [lldb] [lldb] Implement a formatter bytecode interpreter in C++ (PR #114333)

Wed Oct 30 17:04:21 PDT 2024

llvmbot wrote:




@llvm/pr-subscribers-lldb

Author: Adrian Prantl (adrian-prantl)

<details>
<summary>Changes</summary>

Compared to the Python version, this also does type checking and error
handling, so it's slightly longer; however, it's still comfortably
under 500 lines.

See https://discourse.llvm.org/t/a-bytecode-for-lldb-data-formatters/82696 for more context!

This is currently a draft; I still want to add more tests and also extend the metadata with a (show children) flag and potentially others.

---

Patch is 47.47 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114333.diff


19 Files Affected:

- (modified) lldb/include/lldb/DataFormatters/TypeSummary.h (+21-1) 
- (modified) lldb/include/lldb/lldb-enumerations.h (+2) 
- (modified) lldb/source/Core/Section.cpp (+4) 
- (modified) lldb/source/DataFormatters/CMakeLists.txt (+1) 
- (added) lldb/source/DataFormatters/FormatterBytecode.cpp (+576) 
- (added) lldb/source/DataFormatters/FormatterBytecode.def (+101) 
- (added) lldb/source/DataFormatters/FormatterBytecode.h (+63) 
- (modified) lldb/source/DataFormatters/TypeSummary.cpp (+71-3) 
- (modified) lldb/source/Plugins/ObjectFile/ELF/ObjectFileELF.cpp (+2) 
- (modified) lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp (+8) 
- (modified) lldb/source/Plugins/ObjectFile/PECOFF/ObjectFilePECOFF.cpp (+2) 
- (modified) lldb/source/Symbol/ObjectFile.cpp (+1) 
- (modified) lldb/source/Target/Target.cpp (+151) 
- (added) lldb/test/API/functionalities/data-formatter/bytecode-summary/Makefile (+2) 
- (added) lldb/test/API/functionalities/data-formatter/bytecode-summary/TestBytecodeSummary.py (+14) 
- (added) lldb/test/API/functionalities/data-formatter/bytecode-summary/main.cpp (+36) 
- (added) lldb/test/API/functionalities/data-formatter/embedded-summary/Makefile (+2) 
- (added) lldb/test/API/functionalities/data-formatter/embedded-summary/TestEmbeddedTypeSummary.py (+12) 
- (added) lldb/test/API/functionalities/data-formatter/embedded-summary/main.c (+22) 


``````````diff

diff --git a/lldb/include/lldb/DataFormatters/TypeSummary.h b/lldb/include/lldb/DataFormatters/TypeSummary.h
index 382824aa2813da..0d8e46fa0b1598 100644
--- a/lldb/include/lldb/DataFormatters/TypeSummary.h
+++ b/lldb/include/lldb/DataFormatters/TypeSummary.h
@@ -22,6 +22,10 @@
 #include "lldb/Utility/Status.h"
 #include "lldb/Utility/StructuredData.h"
 
+namespace llvm {
+class MemoryBuffer;
+}
+
 namespace lldb_private {
 class TypeSummaryOptions {
 public:
@@ -44,7 +48,7 @@ class TypeSummaryOptions {
 
 class TypeSummaryImpl {
 public:
-  enum class Kind { eSummaryString, eScript, eCallback, eInternal };
+  enum class Kind { eSummaryString, eScript, eBytecode, eCallback, eInternal };
 
   virtual ~TypeSummaryImpl() = default;
 
@@ -409,6 +413,22 @@ struct ScriptSummaryFormat : public TypeSummaryImpl {
   ScriptSummaryFormat(const ScriptSummaryFormat &) = delete;
   const ScriptSummaryFormat &operator=(const ScriptSummaryFormat &) = delete;
 };
+
+/// A summary formatter that is defined in LLDB formatter bytecode.
+class BytecodeSummaryFormat : public TypeSummaryImpl {
+  std::unique_ptr<llvm::MemoryBuffer> m_bytecode;
+public:
+  BytecodeSummaryFormat(const TypeSummaryImpl::Flags &flags,
+                        std::unique_ptr<llvm::MemoryBuffer> bytecode);
+  bool FormatObject(ValueObject *valobj, std::string &dest,
+                    const TypeSummaryOptions &options) override;
+  std::string GetDescription() override;
+  std::string GetName() override;
+  static bool classof(const TypeSummaryImpl *S) {
+    return S->GetKind() == Kind::eBytecode;
+  }
+};
+
 } // namespace lldb_private
 
 #endif // LLDB_DATAFORMATTERS_TYPESUMMARY_H
diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h
index 938f6e3abe8f2a..b2f0943d5a9260 100644
--- a/lldb/include/lldb/lldb-enumerations.h
+++ b/lldb/include/lldb/lldb-enumerations.h
@@ -761,6 +761,8 @@ enum SectionType {
   eSectionTypeDWARFDebugLocListsDwo,
   eSectionTypeDWARFDebugTuIndex,
   eSectionTypeCTF,
+  eSectionTypeLLDBTypeSummaries,
+  eSectionTypeLLDBFormatters,
   eSectionTypeSwiftModules,
 };
 
diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp
index 0763e88d4608f4..3b5ca2c6785ef0 100644
--- a/lldb/source/Core/Section.cpp
+++ b/lldb/source/Core/Section.cpp
@@ -147,6 +147,8 @@ const char *Section::GetTypeAsCString() const {
     return "dwarf-gnu-debugaltlink";
   case eSectionTypeCTF:
     return "ctf";
+  case eSectionTypeLLDBTypeSummaries:
+    return "lldb-type-summaries";
   case eSectionTypeOther:
     return "regular";
   case eSectionTypeSwiftModules:
@@ -457,6 +459,8 @@ bool Section::ContainsOnlyDebugInfo() const {
   case eSectionTypeDWARFAppleObjC:
   case eSectionTypeDWARFGNUDebugAltLink:
   case eSectionTypeCTF:
+  case eSectionTypeLLDBTypeSummaries:
+  case eSectionTypeLLDBFormatters:
   case eSectionTypeSwiftModules:
     return true;
   }
diff --git a/lldb/source/DataFormatters/CMakeLists.txt b/lldb/source/DataFormatters/CMakeLists.txt
index 7f48a2785c73f5..17da138227d4f1 100644
--- a/lldb/source/DataFormatters/CMakeLists.txt
+++ b/lldb/source/DataFormatters/CMakeLists.txt
@@ -5,6 +5,7 @@ add_lldb_library(lldbDataFormatters NO_PLUGIN_DEPENDENCIES
   FormatCache.cpp
   FormatClasses.cpp
   FormatManager.cpp
+  FormatterBytecode.cpp
   FormattersHelpers.cpp
   LanguageCategory.cpp
   StringPrinter.cpp
diff --git a/lldb/source/DataFormatters/FormatterBytecode.cpp b/lldb/source/DataFormatters/FormatterBytecode.cpp
new file mode 100644
index 00000000000000..7e8bfd3a370ce7
--- /dev/null
+++ b/lldb/source/DataFormatters/FormatterBytecode.cpp
@@ -0,0 +1,576 @@
+//===-- FormatterBytecode.cpp ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "FormatterBytecode.h"
+#include "lldb/Core/ValueObject.h"
+#include "lldb/Utility/LLDBLog.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadicDetails.h"
+#include "llvm/Support/FormatProviders.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace lldb;
+namespace lldb_private {
+
+std::string toString(FormatterBytecode::OpCodes op) {
+  switch (op) {
+#define DEFINE_OPCODE(OP, MNEMONIC, NAME)                                      \
+  case OP: {                                                                   \
+    const char *s = MNEMONIC;                                                  \
+    return s ? s : #NAME;                                                      \
+  }
+#include "FormatterBytecode.def"
+#undef DEFINE_SIGNATURE
+  }
+  return llvm::utostr(op);
+}
+
+std::string toString(FormatterBytecode::Selectors sel) {
+  switch (sel) {
+#define DEFINE_SELECTOR(ID, NAME)                                              \
+  case ID:                                                                     \
+    return "@" #NAME;
+#include "FormatterBytecode.def"
+#undef DEFINE_SIGNATURE
+  }
+  return "@"+llvm::utostr(sel);
+}
+
+std::string toString(FormatterBytecode::Signatures sig) {
+  switch (sig) {
+#define DEFINE_SIGNATURE(ID, NAME)                                             \
+  case ID:                                                                     \
+    return "@" #NAME;
+#include "FormatterBytecode.def"
+#undef DEFINE_SIGNATURE
+  }
+  return llvm::utostr(sig);
+}
+
+std::string toString(const FormatterBytecode::DataStack &data) {
+  std::string s;
+  llvm::raw_string_ostream os(s);
+  os << "[ ";
+  for (auto &d : data) {
+    if (auto s = std::get_if<std::string>(&d))
+      os << '"' << *s << '"';
+    else if (auto u = std::get_if<uint64_t>(&d))
+      os << *u << 'u';
+    else if (auto i = std::get_if<int64_t>(&d))
+      os << *i;
+    else if (auto valobj = std::get_if<ValueObjectSP>(&d)) {
+      if (!valobj->get())
+        os << "null";
+      else
+        os << "object(" << valobj->get()->GetValueAsCString() << ')';
+    } else if (auto type = std::get_if<CompilerType>(&d)) {
+      os << '(' << type->GetTypeName(true) << ')';
+    } else if (auto sel = std::get_if<FormatterBytecode::Selectors>(&d)) {
+      os << toString(*sel);
+    }
+    os << ' ';
+  }
+  os << ']';
+  return s;
+}
+
+namespace FormatterBytecode {
+
+/// Implement the @format function.
+static llvm::Error FormatImpl(DataStack &data) {
+  auto fmt = data.Pop<std::string>();
+  auto replacements =
+      llvm::formatv_object_base::parseFormatString(fmt, 0, false);
+  std::string s;
+  llvm::raw_string_ostream os(s);
+  unsigned num_args = 0;
+  for (const auto &r : replacements)
+    if (r.Type == llvm::ReplacementType::Format)
+      num_args = std::max(num_args, r.Index);
+
+  if (data.size() < num_args)
+    return llvm::createStringError("not enough arguments");
+
+  for (const auto &r : replacements) {
+    if (r.Type == llvm::ReplacementType::Literal) {
+      os << r.Spec;
+      continue;
+    }
+    using namespace llvm::support::detail;
+    auto arg = data[data.size() - num_args + r.Index];
+    auto format = [&](format_adapter &&adapter) {
+      llvm::FmtAlign Align(adapter, r.Where, r.Width, r.Pad);
+      Align.format(os, r.Options);
+    };
+
+    if (auto s = std::get_if<std::string>(&arg))
+      format(build_format_adapter(s));
+    else if (auto u = std::get_if<uint64_t>(&arg))
+      format(build_format_adapter(u));
+    else if (auto i = std::get_if<int64_t>(&arg))
+      format(build_format_adapter(i));
+    else if (auto valobj = std::get_if<ValueObjectSP>(&arg)) {
+      if (!valobj->get())
+        format(build_format_adapter("null object"));
+      else
+        format(build_format_adapter(valobj->get()->GetValueAsCString()));
+    } else if (auto type = std::get_if<CompilerType>(&arg))
+      format(build_format_adapter(type->GetDisplayTypeName()));
+    else if (auto sel = std::get_if<FormatterBytecode::Selectors>(&arg))
+      format(build_format_adapter(toString(*sel)));
+  }
+  data.Push(s);
+  return llvm::Error::success();
+}
+
+static llvm::Error TypeCheck(llvm::ArrayRef<DataStackElement> data,
+                             DataType type) {
+  if (data.size() < 1)
+    return llvm::createStringError("not enough elements on data stack");
+
+  auto &elem = data.back();
+  switch (type) {
+  case Any:
+    break;
+  case String:
+    if (!std::holds_alternative<std::string>(elem))
+      return llvm::createStringError("expected String");
+    break;
+  case UInt:
+    if (!std::holds_alternative<uint64_t>(elem))
+      return llvm::createStringError("expected UInt");
+    break;
+  case Int:
+    if (!std::holds_alternative<int64_t>(elem))
+      return llvm::createStringError("expected Int");
+    break;
+  case Object:
+    if (!std::holds_alternative<ValueObjectSP>(elem))
+      return llvm::createStringError("expected Object");
+    break;
+  case Type:
+    if (!std::holds_alternative<CompilerType>(elem))
+      return llvm::createStringError("expected Type");
+    break;
+  case Selector:
+    if (!std::holds_alternative<Selectors>(elem))
+      return llvm::createStringError("expected Selector");
+    break;
+  }
+  return llvm::Error::success();
+}
+
+static llvm::Error TypeCheck(llvm::ArrayRef<DataStackElement> data,
+                             DataType type1, DataType type2) {
+  if (auto error = TypeCheck(data, type2))
+    return error;
+  return TypeCheck(data.drop_back(), type1);
+}
+
+static llvm::Error TypeCheck(llvm::ArrayRef<DataStackElement> data,
+                             DataType type1, DataType type2, DataType type3) {
+  if (auto error = TypeCheck(data, type3))
+    return error;
+  return TypeCheck(data.drop_back(1), type2, type1);
+}
+
+llvm::Error Interpret(std::vector<ControlStackElement> &control,
+                      DataStack &data, Selectors sel) {
+  if (control.empty())
+    return llvm::Error::success();
+  // Since the only data types are single endian and ULEBs, the
+  // endianness should not matter.
+  llvm::DataExtractor cur_block(control.back(), true, 64);
+  llvm::DataExtractor::Cursor pc(0);
+
+  while (!control.empty()) {
+    /// Activate the top most block from the control stack.
+    auto activate_block = [&]() {
+      // Save the return address.
+      if (control.size() > 1)
+        control[control.size() - 2] = cur_block.getData().drop_front(pc.tell());
+      cur_block = llvm::DataExtractor(control.back(), true, 64);
+      if (pc)
+        pc = llvm::DataExtractor::Cursor(0);
+    };
+
+    /// Fetch the next byte in the instruction stream.
+    auto next_byte = [&]() -> uint8_t {
+      // At the end of the current block?
+      while (pc.tell() >= cur_block.size() && !control.empty()) {
+        if (control.size() == 1) {
+          control.pop_back();
+          return 0;
+        }
+        control.pop_back();
+        activate_block();
+      }
+
+      // Fetch the next instruction.
+      return cur_block.getU8(pc);
+    };
+
+    // Fetch the next opcode.
+    OpCodes opcode = (OpCodes)next_byte();
+    if (control.empty() || !pc)
+      return pc.takeError();
+
+    LLDB_LOGV(GetLog(LLDBLog::DataFormatters),
+              "[eval {0}] opcode={1}, control={2}, data={3}", toString(sel),
+              toString(opcode), control.size(), toString(data));
+
+
+    // Various shorthands to improve the readability of error handling.
+#define TYPE_CHECK(...)                                                        \
+  if (auto error = TypeCheck(data, __VA_ARGS__))                               \
+    return error;
+
+    auto error = [&](const char *msg) {
+      return llvm::createStringError("{0} (opcode={1})", msg, toString(opcode).c_str());
+    };
+
+    switch (opcode) {
+    // Data stack manipulation.
+    case op_dup:
+      TYPE_CHECK(Any);
+      data.Push(data.back());
+      break;
+    case op_drop:
+      TYPE_CHECK(Any);
+      data.pop_back();
+      break;
+    case op_pick: {
+      TYPE_CHECK(UInt);
+      uint64_t idx = data.Pop<uint64_t>();
+      if (idx >= data.size())
+        return error("index out of bounds");
+      data.Push(data[idx]);
+      break;
+    }
+    case op_over:
+      TYPE_CHECK(Any, Any);
+      data.Push(data[data.size() - 2]);
+      break;
+    case op_swap: {
+      TYPE_CHECK(Any, Any);
+      auto x = data.PopAny();
+      auto y = data.PopAny();
+      data.Push(x);
+      data.Push(y);
+      break;
+    }
+    case op_rot: {
+      TYPE_CHECK(Any, Any, Any);
+      auto z = data.PopAny();
+      auto y = data.PopAny();
+      auto x = data.PopAny();
+      data.Push(z);
+      data.Push(x);
+      data.Push(y);
+      break;
+    }
+      // Control stack manipulation.
+    case op_begin: {
+      uint64_t length = cur_block.getULEB128(pc);
+      if (!pc)
+        return pc.takeError();
+      llvm::StringRef block = cur_block.getBytes(pc, length);
+      if (!pc)
+        return pc.takeError();
+      control.push_back(block);
+      break;
+    }
+    case op_if:
+      TYPE_CHECK(UInt);
+      if (data.Pop<uint64_t>() != 0) {
+        if (!cur_block.size())
+          return error("empty control stack");
+        activate_block();
+      }
+      break;
+    case op_ifelse:
+      TYPE_CHECK(UInt);
+      if (cur_block.size() < 2)
+        return error("empty control stack");
+      if (data.Pop<uint64_t>() == 0)
+        control[control.size()-2] = control.back();
+      control.pop_back();
+      activate_block();
+      break;
+      // Literals.
+    case op_lit_uint:
+      data.Push(cur_block.getULEB128(pc));
+      break;
+    case op_lit_int:
+      data.Push(cur_block.getSLEB128(pc));
+      break;
+    case op_lit_selector:
+      data.Push(Selectors(cur_block.getU8(pc)));
+      break;
+    case op_lit_string: {
+      uint64_t length = cur_block.getULEB128(pc);
+      llvm::StringRef bytes = cur_block.getBytes(pc, length);
+      data.Push(bytes.str());
+      break;
+    }
+    case op_as_uint: {
+      TYPE_CHECK(Int);
+      uint64_t casted;
+      int64_t val = data.Pop<int64_t>();
+      memcpy(&casted, &val, sizeof(val));
+      data.Push(casted);
+      break;
+    }
+    case op_as_int: {
+      TYPE_CHECK(UInt);
+      int64_t casted;
+      uint64_t val = data.Pop<uint64_t>();
+      memcpy(&casted, &val, sizeof(val));
+      data.Push(casted);
+      break;
+    }
+    case op_is_null: {
+      TYPE_CHECK(Object);
+      data.Push(data.Pop<ValueObjectSP>() ? 0ULL : 1ULL);
+      break;
+    }
+    // Arithmetic, logic, etc.
+#define BINOP_IMPL(OP, CHECK_ZERO)                                             \
+  {                                                                            \
+    TYPE_CHECK(Any, Any);                                                      \
+    auto y = data.PopAny();                                                    \
+    if (std::holds_alternative<uint64_t>(y)) {                                 \
+      if (CHECK_ZERO && !std::get<uint64_t>(y))                                \
+        return error(#OP " by zero");                                          \
+      TYPE_CHECK(UInt);                                                        \
+      data.Push((uint64_t)(data.Pop<uint64_t>() OP std::get<uint64_t>(y)));    \
+    } else if (std::holds_alternative<int64_t>(y)) {                           \
+      if (CHECK_ZERO && !std::get<int64_t>(y))                                 \
+        return error(#OP " by zero");                                          \
+      TYPE_CHECK(Int);                                                         \
+      data.Push((int64_t)(data.Pop<int64_t>() OP std::get<int64_t>(y)));       \
+    } else                                                                     \
+      return error("unsupported data types");                                  \
+  }
+#define BINOP(OP) BINOP_IMPL(OP, false)
+#define BINOP_CHECKZERO(OP) BINOP_IMPL(OP, true)
+    case op_plus:
+      BINOP(+);
+      break;
+    case op_minus:
+      BINOP(-);
+      break;
+    case op_mul:
+      BINOP(*);
+      break;
+    case op_div:
+      BINOP_CHECKZERO(/);
+      break;
+    case op_mod:
+      BINOP_CHECKZERO(%);
+      break;
+    case op_shl:
+#define SHIFTOP(OP)                                                            \
+  {                                                                            \
+    TYPE_CHECK(Any, Any);                                                      \
+    if (std::holds_alternative<uint64_t>(data.back())) {                       \
+      uint64_t y = data.Pop<uint64_t>();                                       \
+      TYPE_CHECK(UInt);                                                        \
+      uint64_t x = data.Pop<uint64_t>();                                       \
+      if (y > 64)                                                              \
+        return error("shift out of bounds");                                   \
+      data.Push(x OP y);                                                       \
+    } else if (std::holds_alternative<int64_t>(data.back())) {                 \
+      uint64_t y = data.Pop<int64_t>();                                        \
+      TYPE_CHECK(Int);                                                         \
+      uint64_t x = data.Pop<int64_t>();                                        \
+      if (y > 64)                                                              \
+        return error("shift out of bounds");                                   \
+      if (y < 0)                                                               \
+        return error("shift out of bounds");                                   \
+      data.Push(x OP y);                                                       \
+    } else                                                                     \
+      return error("unsupported data types");                                  \
+  }
+      SHIFTOP(<<);
+      break;
+    case op_shr:
+      SHIFTOP(<<);
+      break;
+    case op_and:
+      BINOP(&);
+      break;
+    case op_or:
+      BINOP(|);
+      break;
+    case op_xor:
+      BINOP(^);
+      break;
+    case op_not:
+      TYPE_CHECK(UInt);
+      data.Push(~data.Pop<uint64_t>());
+      break;
+    case op_eq:
+      BINOP(==);
+      break;
+    case op_neq:
+      BINOP(!=);
+      break;
+    case op_lt:
+      BINOP(<);
+      break;
+    case op_gt:
+      BINOP(>);
+      break;
+    case op_le:
+      BINOP(<=);
+      break;
+    case op_ge:
+      BINOP(>=);
+      break;
+    case op_call: {
+      TYPE_CHECK(Selector);
+      Selectors sel = data.Pop<Selectors>();
+
+      // Shorthand to improve readability.
+#define POP_VALOBJ(VALOBJ)                                                     \
+  auto VALOBJ = data.Pop<ValueObjectSP>();                                     \
+  if (!VALOBJ)                                                                 \
+    return error("null object");
+
+      auto sel_error = [&](const char *msg) {
+        return llvm::createStringError("{0} (opcode={1}, selector={2})", msg,
+                                       toString(opcode).c_str(),
+                                       toString(sel).c_str());
+      };
+
+      switch (sel) {
+      case sel_summary: {
+        TYPE_CHECK(Object);
+        POP_VALOBJ(valobj);
+        const char *summary = valobj->GetSummaryAsCString();
+        data.Push(summary ? std::string(valobj->GetSummaryAsCString())
+                          : std::string());
+        break;
+      }
+      case sel_get_num_children: {
+        TYPE_CHECK(Object);
+        POP_VALOBJ(valobj);
+        auto result = valobj->GetNumChildren();
+        if (!result)
+          return result.takeError();
+        data.Push((uint64_t)*result);
+        break;
+      }
+      case sel_get_child_at_index: {
+        TYPE_CHECK(Object, UInt);
+        auto index = data.Pop<uint64_t>();
+        POP_VALOBJ(valobj);
+        data.Push(valobj->GetChildAtIndex(index));
+        break;
+      }
+      case sel_get_child_with_name: {
+        TYPE_CHECK(Object, String);
+        auto name = data.Pop<std::string>();
+        POP_VALOBJ(valobj);
+        data.Push(valobj->GetChildMemberWithName(name));
+        break;
+      }
+      c...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/114333