[Lldb-commits] [lldb] [lldb][WIP] dump riscv opcode bytes as blocks of 2/4 bytes (PR #148105)

Thu Jul 10 23:22:39 PDT 2025

https://github.com/jasonmolenda updated https://github.com/llvm/llvm-project/pull/148105

>From 74995e8f1a982a82bafaa150fb407187fa4b95e4 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:31:54 -0700
Subject: [PATCH 1/4] [lldb][WIP] dump riscv opcode bytes as blocks of 2/4
 bytes

Part of the changes by Ted Woodward et al in
https://github.com/llvm/llvm-project/pull/145793
to change the riscv disassemble --bytes mode output
to match llvm-objdump prints the instruction bytes as
sequences of 2/4 bytes for instructions that may be
2/4/6/8/etc bytes long, instead of printing them as
an array of UInt8's like x86 as is done today.
This PR is doing it by adding a special Dump method
to Opcode, but I believe the correct way to accomplish
this is to have an Opcode::Type that implies this formatting,
and not adding a DumpRISCV() method.

I put up this PR to show what I meant by this.

The unit test I added fails on the 64-bit example instruction,
MCInst seems to be decoding this as a 2-byte instruction.  I'm not
sure if I copied this hypothetical instruction from the original
PR incorrectly, or if they might be working on a downstream branch
that can decode this, it seems to work in their test cases.
---
 lldb/include/lldb/Core/Opcode.h               | 39 ++++++++++---
 lldb/source/Core/Opcode.cpp                   | 38 +++++++++++-
 .../Disassembler/LLVMC/DisassemblerLLVMC.cpp  | 58 +++++++++++--------
 lldb/source/Utility/ArchSpec.cpp              |  4 +-
 .../RISCV/TestMCDisasmInstanceRISCV.cpp       | 46 +++++++++++++++
 5 files changed, 152 insertions(+), 33 deletions(-)

diff --git a/lldb/include/lldb/Core/Opcode.h b/lldb/include/lldb/Core/Opcode.h
index f72f2687b54fe..91af15c62e6ab 100644
--- a/lldb/include/lldb/Core/Opcode.h
+++ b/lldb/include/lldb/Core/Opcode.h
@@ -32,7 +32,10 @@ class Opcode {
     eTypeInvalid,
     eType8,
     eType16,
-    eType16_2, // a 32-bit Thumb instruction, made up of two words
+    eType16_2,        // a 32-bit Thumb instruction, made up of two words
+    eType16_32Tuples, // RISC-V that can have 2, 4, 6, 8 etc byte long
+                      // instructions which will be printed in combinations of
+                      // 16 & 32-bit words.
     eType32,
     eType64,
     eTypeBytes
@@ -60,9 +63,9 @@ class Opcode {
     m_data.inst64 = inst;
   }
 
-  Opcode(uint8_t *bytes, size_t length)
-      : m_byte_order(lldb::eByteOrderInvalid) {
-    SetOpcodeBytes(bytes, length);
+  Opcode(uint8_t *bytes, size_t length, Opcode::Type type,
+         lldb::ByteOrder order) {
+    DoSetOpcodeBytes(bytes, length, type, order);
   }
 
   void Clear() {
@@ -82,6 +85,8 @@ class Opcode {
       break;
     case Opcode::eType16_2:
       break;
+    case Opcode::eType16_32Tuples:
+      break;
     case Opcode::eType32:
       break;
     case Opcode::eType64:
@@ -103,6 +108,8 @@ class Opcode {
                              : m_data.inst16;
     case Opcode::eType16_2:
       break;
+    case Opcode::eType16_32Tuples:
+      break;
     case Opcode::eType32:
       break;
     case Opcode::eType64:
@@ -122,6 +129,8 @@ class Opcode {
     case Opcode::eType16:
       return GetEndianSwap() ? llvm::byteswap<uint16_t>(m_data.inst16)
                              : m_data.inst16;
+    case Opcode::eType16_32Tuples:
+      break;
     case Opcode::eType16_2: // passthrough
     case Opcode::eType32:
       return GetEndianSwap() ? llvm::byteswap<uint32_t>(m_data.inst32)
@@ -143,6 +152,8 @@ class Opcode {
     case Opcode::eType16:
       return GetEndianSwap() ? llvm::byteswap<uint16_t>(m_data.inst16)
                              : m_data.inst16;
+    case Opcode::eType16_32Tuples:
+      break;
     case Opcode::eType16_2: // passthrough
     case Opcode::eType32:
       return GetEndianSwap() ? llvm::byteswap<uint32_t>(m_data.inst32)
@@ -186,20 +197,30 @@ class Opcode {
     m_byte_order = order;
   }
 
+  void SetOpcode16_32TupleBytes(const void *bytes, size_t length,
+                                lldb::ByteOrder order) {
+    DoSetOpcodeBytes(bytes, length, eType16_32Tuples, order);
+  }
+
   void SetOpcodeBytes(const void *bytes, size_t length) {
+    DoSetOpcodeBytes(bytes, length, eTypeBytes, lldb::eByteOrderInvalid);
+  }
+
+  void DoSetOpcodeBytes(const void *bytes, size_t length, Opcode::Type type,
+                        lldb::ByteOrder order) {
     if (bytes != nullptr && length > 0) {
-      m_type = eTypeBytes;
+      m_type = type;
       m_data.inst.length = length;
       assert(length < sizeof(m_data.inst.bytes));
       memcpy(m_data.inst.bytes, bytes, length);
-      m_byte_order = lldb::eByteOrderInvalid;
+      m_byte_order = order;
     } else {
       m_type = eTypeInvalid;
       m_data.inst.length = 0;
     }
   }
 
-  int Dump(Stream *s, uint32_t min_byte_width);
+  int Dump(Stream *s, uint32_t min_byte_width) const;
 
   const void *GetOpcodeBytes() const {
     return ((m_type == Opcode::eTypeBytes) ? m_data.inst.bytes : nullptr);
@@ -213,6 +234,8 @@ class Opcode {
       return sizeof(m_data.inst8);
     case Opcode::eType16:
       return sizeof(m_data.inst16);
+    case Opcode::eType16_32Tuples:
+      return m_data.inst.length;
     case Opcode::eType16_2: // passthrough
     case Opcode::eType32:
       return sizeof(m_data.inst32);
@@ -238,6 +261,8 @@ class Opcode {
       return &m_data.inst8;
     case Opcode::eType16:
       return &m_data.inst16;
+    case Opcode::eType16_32Tuples:
+      return m_data.inst.bytes;
     case Opcode::eType16_2: // passthrough
     case Opcode::eType32:
       return &m_data.inst32;
diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 3e30d98975d8a..15ae87ea99f9f 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -21,7 +21,7 @@
 using namespace lldb;
 using namespace lldb_private;
 
-int Opcode::Dump(Stream *s, uint32_t min_byte_width) {
+int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
   const uint32_t previous_bytes = s->GetWrittenBytes();
   switch (m_type) {
   case Opcode::eTypeInvalid:
@@ -38,6 +38,38 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) {
     s->Printf("0x%8.8x", m_data.inst32);
     break;
 
+  case Opcode::eType16_32Tuples: {
+    uint8_t buf[m_data.inst.length];
+    // Swap the byte order if lldb and the
+    // target are different endian.  May be
+    // any of 2/4/6/8/etc bytes long, easiest
+    // to reverse the bytes manually.
+    if (GetEndianSwap())
+      for (int i = 0; i < m_data.inst.length; i++)
+        buf[m_data.inst.length - i] = m_data.inst.bytes[i];
+    else
+      memcpy(buf, m_data.inst.bytes, m_data.inst.length);
+    uint32_t i = 0;
+    while (i < m_data.inst.length) {
+      if (i > 0)
+        s->PutChar(' ');
+      // if size % 4 == 0, print as 1 or 2 32 bit values (32 or 64 bit inst)
+      if (!(m_data.inst.length % 4)) {
+        uint32_t value;
+        memcpy(&value, &buf[i], 4);
+        s->Printf("%8.8x", value);
+        i += 4;
+        // else if size % 2 == 0, print as 1 or 3 16 bit values (16 or 48 bit
+        // inst)
+      } else if (!(m_data.inst.length % 2)) {
+        uint16_t value;
+        memcpy(&value, &buf[i], 2);
+        s->Printf("%4.4x", value);
+        i += 2;
+      }
+    }
+  } break;
+
   case Opcode::eType64:
     s->Printf("0x%16.16" PRIx64, m_data.inst64);
     break;
@@ -69,6 +101,7 @@ lldb::ByteOrder Opcode::GetDataByteOrder() const {
   case Opcode::eType8:
   case Opcode::eType16:
   case Opcode::eType16_2:
+  case Opcode::eType16_32Tuples:
   case Opcode::eType32:
   case Opcode::eType64:
     return endian::InlHostByteOrder();
@@ -113,6 +146,9 @@ uint32_t Opcode::GetData(DataExtractor &data) const {
         swap_buf[3] = m_data.inst.bytes[2];
         buf = swap_buf;
         break;
+      case Opcode::eType16_32Tuples:
+        buf = GetOpcodeDataBytes();
+        break;
       case Opcode::eType32:
         *(uint32_t *)swap_buf = llvm::byteswap<uint32_t>(m_data.inst32);
         buf = swap_buf;
diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
index ed6047f8f4ef3..0a8caf54ef1a0 100644
--- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
+++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
@@ -61,6 +61,8 @@ class DisassemblerLLVMC::MCDisasmInstance {
 
   uint64_t GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len,
                      lldb::addr_t pc, llvm::MCInst &mc_inst) const;
+  bool GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len,
+                 lldb::addr_t pc, llvm::MCInst &mc_inst, uint64_t &size) const;
   void PrintMCInst(llvm::MCInst &mc_inst, lldb::addr_t pc,
                    std::string &inst_string, std::string &comments_string);
   void SetStyle(bool use_hex_immed, HexImmediateStyle hex_style);
@@ -486,8 +488,13 @@ class InstructionLLVMC : public lldb_private::Instruction {
           break;
 
         default:
-          m_opcode.SetOpcodeBytes(data.PeekData(data_offset, min_op_byte_size),
-                                  min_op_byte_size);
+          if (arch.GetTriple().isRISCV())
+            m_opcode.SetOpcode16_32TupleBytes(
+                data.PeekData(data_offset, min_op_byte_size), min_op_byte_size,
+                byte_order);
+          else
+            m_opcode.SetOpcodeBytes(
+                data.PeekData(data_offset, min_op_byte_size), min_op_byte_size);
           got_op = true;
           break;
         }
@@ -524,13 +531,16 @@ class InstructionLLVMC : public lldb_private::Instruction {
           const addr_t pc = m_address.GetFileAddress();
           llvm::MCInst inst;
 
-          const size_t inst_size =
-              mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
-          if (inst_size == 0)
-            m_opcode.Clear();
-          else {
-            m_opcode.SetOpcodeBytes(opcode_data, inst_size);
-            m_is_valid = true;
+          uint64_t inst_size = 0;
+          m_is_valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len,
+                                                pc, inst, inst_size);
+          m_opcode.Clear();
+          if (inst_size != 0) {
+            if (arch.GetTriple().isRISCV())
+              m_opcode.SetOpcode16_32TupleBytes(opcode_data, inst_size,
+                                                byte_order);
+            else
+              m_opcode.SetOpcodeBytes(opcode_data, inst_size);
           }
         }
       }
@@ -604,10 +614,11 @@ class InstructionLLVMC : public lldb_private::Instruction {
         const uint8_t *opcode_data = data.GetDataStart();
         const size_t opcode_data_len = data.GetByteSize();
         llvm::MCInst inst;
-        size_t inst_size =
-            mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
+        uint64_t inst_size = 0;
+        bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc,
+                                              inst, inst_size);
 
-        if (inst_size > 0) {
+        if (valid && inst_size > 0) {
           mc_disasm_ptr->SetStyle(use_hex_immediates, hex_style);
 
           const bool saved_use_color = mc_disasm_ptr->GetUseColor();
@@ -1206,9 +1217,10 @@ class InstructionLLVMC : public lldb_private::Instruction {
     const uint8_t *opcode_data = data.GetDataStart();
     const size_t opcode_data_len = data.GetByteSize();
     llvm::MCInst inst;
-    const size_t inst_size =
-        mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
-    if (inst_size == 0)
+    uint64_t inst_size = 0;
+    const bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len,
+                                                pc, inst, inst_size);
+    if (!valid)
       return;
 
     m_has_visited_instruction = true;
@@ -1337,19 +1349,19 @@ DisassemblerLLVMC::MCDisasmInstance::MCDisasmInstance(
          m_asm_info_up && m_context_up && m_disasm_up && m_instr_printer_up);
 }
 
-uint64_t DisassemblerLLVMC::MCDisasmInstance::GetMCInst(
-    const uint8_t *opcode_data, size_t opcode_data_len, lldb::addr_t pc,
-    llvm::MCInst &mc_inst) const {
+bool DisassemblerLLVMC::MCDisasmInstance::GetMCInst(const uint8_t *opcode_data,
+                                                    size_t opcode_data_len,
+                                                    lldb::addr_t pc,
+                                                    llvm::MCInst &mc_inst,
+                                                    uint64_t &size) const {
   llvm::ArrayRef<uint8_t> data(opcode_data, opcode_data_len);
   llvm::MCDisassembler::DecodeStatus status;
 
-  uint64_t new_inst_size;
-  status = m_disasm_up->getInstruction(mc_inst, new_inst_size, data, pc,
-                                       llvm::nulls());
+  status = m_disasm_up->getInstruction(mc_inst, size, data, pc, llvm::nulls());
   if (status == llvm::MCDisassembler::Success)
-    return new_inst_size;
+    return true;
   else
-    return 0;
+    return false;
 }
 
 void DisassemblerLLVMC::MCDisasmInstance::PrintMCInst(
diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp
index 70b9800f4dade..7c71aaae6bcf2 100644
--- a/lldb/source/Utility/ArchSpec.cpp
+++ b/lldb/source/Utility/ArchSpec.cpp
@@ -228,9 +228,9 @@ static const CoreDefinition g_core_definitions[] = {
     {eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon,
      ArchSpec::eCore_hexagon_hexagonv5, "hexagonv5"},
 
-    {eByteOrderLittle, 4, 2, 4, llvm::Triple::riscv32, ArchSpec::eCore_riscv32,
+    {eByteOrderLittle, 4, 2, 8, llvm::Triple::riscv32, ArchSpec::eCore_riscv32,
      "riscv32"},
-    {eByteOrderLittle, 8, 2, 4, llvm::Triple::riscv64, ArchSpec::eCore_riscv64,
+    {eByteOrderLittle, 8, 2, 8, llvm::Triple::riscv64, ArchSpec::eCore_riscv64,
      "riscv64"},
 
     {eByteOrderLittle, 4, 4, 4, llvm::Triple::loongarch32,
diff --git a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
index 8ec5d62a99ac5..51487034d89cd 100644
--- a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
+++ b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
@@ -14,6 +14,7 @@
 #include "lldb/Core/Disassembler.h"
 #include "lldb/Target/ExecutionContext.h"
 #include "lldb/Utility/ArchSpec.h"
+#include "lldb/Utility/StreamString.h"
 
 #include "Plugins/Disassembler/LLVMC/DisassemblerLLVMC.h"
 
@@ -90,3 +91,48 @@ TEST_F(TestMCDisasmInstanceRISCV, TestRISCV32Instruction) {
   EXPECT_FALSE(inst_sp->IsCall());
   EXPECT_TRUE(inst_sp->DoesBranch());
 }
+
+TEST_F(TestMCDisasmInstanceRISCV, TestOpcodeBytePrinter) {
+  ArchSpec arch("riscv32-*-linux");
+
+  const unsigned num_of_instructions = 7;
+  uint8_t data[] = {
+      0x41, 0x11,                               // addi   sp, sp, -0x10
+      0x06, 0xc6,                               // sw     ra, 0xc(sp)
+      0x23, 0x2a, 0xa4, 0xfe,                   // sw     a0, -0xc(s0)
+      0x23, 0x28, 0xa4, 0xfe,                   // sw     a0, -0x10(s0)
+      0x22, 0x44,                               // lw     s0, 0x8(sp)
+      0x20, 0x00, 0x20, 0x09, 0x40, 0x00, 0x3F, // 64 bit custom instruction
+      0x10, 0x00, 0x00, 0x00, 0x02, 0x1f // 48 bit xqci.e.li rd=8 imm=0x1000
+  };
+
+  const char *expected_outputs[] = {"1141",          "c606",
+                                    "fea42a23",      "fea42823",
+                                    "4422",          "0940003f 00200020",
+                                    "021f 0000 1000"};
+
+  EXPECT_EQ(num_of_instructions, sizeof(expected_outputs) / sizeof(char *));
+
+  DisassemblerSP disass_sp;
+  Address start_addr(0x100);
+  disass_sp = Disassembler::DisassembleBytes(
+      arch, nullptr, nullptr, nullptr, nullptr, start_addr, &data, sizeof(data),
+      num_of_instructions, false);
+
+  // If we failed to get a disassembler, we can assume it is because
+  // the llvm we linked against was not built with the riscv target,
+  // and we should skip these tests without marking anything as failing.
+  if (!disass_sp)
+    return;
+
+  const InstructionList inst_list(disass_sp->GetInstructionList());
+  EXPECT_EQ(num_of_instructions, inst_list.GetSize());
+
+  for (size_t i = 0; i < sizeof(expected_outputs) / sizeof(char *); i++) {
+    InstructionSP inst_sp;
+    StreamString s;
+    inst_sp = inst_list.GetInstructionAtIndex(i);
+    inst_sp->GetOpcode().Dump(&s, 1);
+    ASSERT_STREQ(s.GetString().str().c_str(), expected_outputs[i]);
+  }
+}

>From 8880121940ded4b535284911c23fa387aabd16d1 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:46:55 -0700
Subject: [PATCH 2/4] un-clang-formatify

---
 .../RISCV/TestMCDisasmInstanceRISCV.cpp           | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
index 51487034d89cd..de4de32145e09 100644
--- a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
+++ b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
@@ -106,10 +106,17 @@ TEST_F(TestMCDisasmInstanceRISCV, TestOpcodeBytePrinter) {
       0x10, 0x00, 0x00, 0x00, 0x02, 0x1f // 48 bit xqci.e.li rd=8 imm=0x1000
   };
 
-  const char *expected_outputs[] = {"1141",          "c606",
-                                    "fea42a23",      "fea42823",
-                                    "4422",          "0940003f 00200020",
-                                    "021f 0000 1000"};
+  // clang-format off
+  const char *expected_outputs[] = {
+    "1141",
+    "c606",
+    "fea42a23",
+    "fea42823",
+    "4422",
+    "0940003f 00200020",
+    "021f 0000 1000"
+  };
+  // clang-format on
 
   EXPECT_EQ(num_of_instructions, sizeof(expected_outputs) / sizeof(char *));
 

>From 5825ddfe556f424a884d792787ca8ad13d3aa7dd Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:50:38 -0700
Subject: [PATCH 3/4] fix typeo

---
 lldb/source/Core/Opcode.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 15ae87ea99f9f..34319dc5b0a93 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -46,7 +46,7 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
     // to reverse the bytes manually.
     if (GetEndianSwap())
       for (int i = 0; i < m_data.inst.length; i++)
-        buf[m_data.inst.length - i] = m_data.inst.bytes[i];
+        buf[m_data.inst.length - i - 1] = m_data.inst.bytes[i];
     else
       memcpy(buf, m_data.inst.bytes, m_data.inst.length);
     uint32_t i = 0;

>From 6a775cbac6a0246a3514e0a2cde700e7363461f7 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 23:22:18 -0700
Subject: [PATCH 4/4] clarify dump method a bit more.

---
 lldb/source/Core/Opcode.cpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 34319dc5b0a93..d5cf12245a8dc 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -49,19 +49,20 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
         buf[m_data.inst.length - i - 1] = m_data.inst.bytes[i];
     else
       memcpy(buf, m_data.inst.bytes, m_data.inst.length);
+
+    const bool format_as_words = !(m_data.inst.length % 4);
     uint32_t i = 0;
     while (i < m_data.inst.length) {
       if (i > 0)
         s->PutChar(' ');
-      // if size % 4 == 0, print as 1 or 2 32 bit values (32 or 64 bit inst)
-      if (!(m_data.inst.length % 4)) {
+      if (format_as_words) {
+        // Format as words; print 1 or more UInt32 values.
         uint32_t value;
         memcpy(&value, &buf[i], 4);
         s->Printf("%8.8x", value);
         i += 4;
-        // else if size % 2 == 0, print as 1 or 3 16 bit values (16 or 48 bit
-        // inst)
-      } else if (!(m_data.inst.length % 2)) {
+      } else {
+        // Format as halfwords; print 1 or more UInt16 values.
         uint16_t value;
         memcpy(&value, &buf[i], 2);
         s->Printf("%4.4x", value);