[Lldb-commits] [lldb] [lldb][WIP] dump riscv opcode bytes as blocks of 2/4 bytes (PR #148105)
Jason Molenda via lldb-commits
lldb-commits at lists.llvm.org
Thu Jul 10 23:22:39 PDT 2025
https://github.com/jasonmolenda updated https://github.com/llvm/llvm-project/pull/148105
>From 74995e8f1a982a82bafaa150fb407187fa4b95e4 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:31:54 -0700
Subject: [PATCH 1/4] [lldb][WIP] dump riscv opcode bytes as blocks of 2/4
bytes
Part of the changes by Ted Woodward et al in
https://github.com/llvm/llvm-project/pull/145793
to change the riscv disassemble --bytes mode output
to match llvm-objdump prints the instruction bytes as
sequences of 2/4 bytes for instructions that may be
2/4/6/8/etc bytes long, instead of printing them as
an array of UInt8's like x86 as is done today.
This PR is doing it by adding a special Dump method
to Opcode, but I believe the correct way to accomplish
this is to have an Opcode::Type that implies this formatting,
and not adding a DumpRISCV() method.
I put up this PR to show what I meant by this.
The unit test I added fails on the 64-bit example instruction,
MCInst seems to be decoding this as a 2-byte instruction. I'm not
sure if I copied this hypothetical instruction from the original
PR incorrectly, or if they might be working on a downstream branch
that can decode this, it seems to work in their test cases.
---
lldb/include/lldb/Core/Opcode.h | 39 ++++++++++---
lldb/source/Core/Opcode.cpp | 38 +++++++++++-
.../Disassembler/LLVMC/DisassemblerLLVMC.cpp | 58 +++++++++++--------
lldb/source/Utility/ArchSpec.cpp | 4 +-
.../RISCV/TestMCDisasmInstanceRISCV.cpp | 46 +++++++++++++++
5 files changed, 152 insertions(+), 33 deletions(-)
diff --git a/lldb/include/lldb/Core/Opcode.h b/lldb/include/lldb/Core/Opcode.h
index f72f2687b54fe..91af15c62e6ab 100644
--- a/lldb/include/lldb/Core/Opcode.h
+++ b/lldb/include/lldb/Core/Opcode.h
@@ -32,7 +32,10 @@ class Opcode {
eTypeInvalid,
eType8,
eType16,
- eType16_2, // a 32-bit Thumb instruction, made up of two words
+ eType16_2, // a 32-bit Thumb instruction, made up of two words
+ eType16_32Tuples, // RISC-V that can have 2, 4, 6, 8 etc byte long
+ // instructions which will be printed in combinations of
+ // 16 & 32-bit words.
eType32,
eType64,
eTypeBytes
@@ -60,9 +63,9 @@ class Opcode {
m_data.inst64 = inst;
}
- Opcode(uint8_t *bytes, size_t length)
- : m_byte_order(lldb::eByteOrderInvalid) {
- SetOpcodeBytes(bytes, length);
+ Opcode(uint8_t *bytes, size_t length, Opcode::Type type,
+ lldb::ByteOrder order) {
+ DoSetOpcodeBytes(bytes, length, type, order);
}
void Clear() {
@@ -82,6 +85,8 @@ class Opcode {
break;
case Opcode::eType16_2:
break;
+ case Opcode::eType16_32Tuples:
+ break;
case Opcode::eType32:
break;
case Opcode::eType64:
@@ -103,6 +108,8 @@ class Opcode {
: m_data.inst16;
case Opcode::eType16_2:
break;
+ case Opcode::eType16_32Tuples:
+ break;
case Opcode::eType32:
break;
case Opcode::eType64:
@@ -122,6 +129,8 @@ class Opcode {
case Opcode::eType16:
return GetEndianSwap() ? llvm::byteswap<uint16_t>(m_data.inst16)
: m_data.inst16;
+ case Opcode::eType16_32Tuples:
+ break;
case Opcode::eType16_2: // passthrough
case Opcode::eType32:
return GetEndianSwap() ? llvm::byteswap<uint32_t>(m_data.inst32)
@@ -143,6 +152,8 @@ class Opcode {
case Opcode::eType16:
return GetEndianSwap() ? llvm::byteswap<uint16_t>(m_data.inst16)
: m_data.inst16;
+ case Opcode::eType16_32Tuples:
+ break;
case Opcode::eType16_2: // passthrough
case Opcode::eType32:
return GetEndianSwap() ? llvm::byteswap<uint32_t>(m_data.inst32)
@@ -186,20 +197,30 @@ class Opcode {
m_byte_order = order;
}
+ void SetOpcode16_32TupleBytes(const void *bytes, size_t length,
+ lldb::ByteOrder order) {
+ DoSetOpcodeBytes(bytes, length, eType16_32Tuples, order);
+ }
+
void SetOpcodeBytes(const void *bytes, size_t length) {
+ DoSetOpcodeBytes(bytes, length, eTypeBytes, lldb::eByteOrderInvalid);
+ }
+
+ void DoSetOpcodeBytes(const void *bytes, size_t length, Opcode::Type type,
+ lldb::ByteOrder order) {
if (bytes != nullptr && length > 0) {
- m_type = eTypeBytes;
+ m_type = type;
m_data.inst.length = length;
assert(length < sizeof(m_data.inst.bytes));
memcpy(m_data.inst.bytes, bytes, length);
- m_byte_order = lldb::eByteOrderInvalid;
+ m_byte_order = order;
} else {
m_type = eTypeInvalid;
m_data.inst.length = 0;
}
}
- int Dump(Stream *s, uint32_t min_byte_width);
+ int Dump(Stream *s, uint32_t min_byte_width) const;
const void *GetOpcodeBytes() const {
return ((m_type == Opcode::eTypeBytes) ? m_data.inst.bytes : nullptr);
@@ -213,6 +234,8 @@ class Opcode {
return sizeof(m_data.inst8);
case Opcode::eType16:
return sizeof(m_data.inst16);
+ case Opcode::eType16_32Tuples:
+ return m_data.inst.length;
case Opcode::eType16_2: // passthrough
case Opcode::eType32:
return sizeof(m_data.inst32);
@@ -238,6 +261,8 @@ class Opcode {
return &m_data.inst8;
case Opcode::eType16:
return &m_data.inst16;
+ case Opcode::eType16_32Tuples:
+ return m_data.inst.bytes;
case Opcode::eType16_2: // passthrough
case Opcode::eType32:
return &m_data.inst32;
diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 3e30d98975d8a..15ae87ea99f9f 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -21,7 +21,7 @@
using namespace lldb;
using namespace lldb_private;
-int Opcode::Dump(Stream *s, uint32_t min_byte_width) {
+int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
const uint32_t previous_bytes = s->GetWrittenBytes();
switch (m_type) {
case Opcode::eTypeInvalid:
@@ -38,6 +38,38 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) {
s->Printf("0x%8.8x", m_data.inst32);
break;
+ case Opcode::eType16_32Tuples: {
+ uint8_t buf[m_data.inst.length];
+ // Swap the byte order if lldb and the
+ // target are different endian. May be
+ // any of 2/4/6/8/etc bytes long, easiest
+ // to reverse the bytes manually.
+ if (GetEndianSwap())
+ for (int i = 0; i < m_data.inst.length; i++)
+ buf[m_data.inst.length - i] = m_data.inst.bytes[i];
+ else
+ memcpy(buf, m_data.inst.bytes, m_data.inst.length);
+ uint32_t i = 0;
+ while (i < m_data.inst.length) {
+ if (i > 0)
+ s->PutChar(' ');
+ // if size % 4 == 0, print as 1 or 2 32 bit values (32 or 64 bit inst)
+ if (!(m_data.inst.length % 4)) {
+ uint32_t value;
+ memcpy(&value, &buf[i], 4);
+ s->Printf("%8.8x", value);
+ i += 4;
+ // else if size % 2 == 0, print as 1 or 3 16 bit values (16 or 48 bit
+ // inst)
+ } else if (!(m_data.inst.length % 2)) {
+ uint16_t value;
+ memcpy(&value, &buf[i], 2);
+ s->Printf("%4.4x", value);
+ i += 2;
+ }
+ }
+ } break;
+
case Opcode::eType64:
s->Printf("0x%16.16" PRIx64, m_data.inst64);
break;
@@ -69,6 +101,7 @@ lldb::ByteOrder Opcode::GetDataByteOrder() const {
case Opcode::eType8:
case Opcode::eType16:
case Opcode::eType16_2:
+ case Opcode::eType16_32Tuples:
case Opcode::eType32:
case Opcode::eType64:
return endian::InlHostByteOrder();
@@ -113,6 +146,9 @@ uint32_t Opcode::GetData(DataExtractor &data) const {
swap_buf[3] = m_data.inst.bytes[2];
buf = swap_buf;
break;
+ case Opcode::eType16_32Tuples:
+ buf = GetOpcodeDataBytes();
+ break;
case Opcode::eType32:
*(uint32_t *)swap_buf = llvm::byteswap<uint32_t>(m_data.inst32);
buf = swap_buf;
diff --git a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
index ed6047f8f4ef3..0a8caf54ef1a0 100644
--- a/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
+++ b/lldb/source/Plugins/Disassembler/LLVMC/DisassemblerLLVMC.cpp
@@ -61,6 +61,8 @@ class DisassemblerLLVMC::MCDisasmInstance {
uint64_t GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len,
lldb::addr_t pc, llvm::MCInst &mc_inst) const;
+ bool GetMCInst(const uint8_t *opcode_data, size_t opcode_data_len,
+ lldb::addr_t pc, llvm::MCInst &mc_inst, uint64_t &size) const;
void PrintMCInst(llvm::MCInst &mc_inst, lldb::addr_t pc,
std::string &inst_string, std::string &comments_string);
void SetStyle(bool use_hex_immed, HexImmediateStyle hex_style);
@@ -486,8 +488,13 @@ class InstructionLLVMC : public lldb_private::Instruction {
break;
default:
- m_opcode.SetOpcodeBytes(data.PeekData(data_offset, min_op_byte_size),
- min_op_byte_size);
+ if (arch.GetTriple().isRISCV())
+ m_opcode.SetOpcode16_32TupleBytes(
+ data.PeekData(data_offset, min_op_byte_size), min_op_byte_size,
+ byte_order);
+ else
+ m_opcode.SetOpcodeBytes(
+ data.PeekData(data_offset, min_op_byte_size), min_op_byte_size);
got_op = true;
break;
}
@@ -524,13 +531,16 @@ class InstructionLLVMC : public lldb_private::Instruction {
const addr_t pc = m_address.GetFileAddress();
llvm::MCInst inst;
- const size_t inst_size =
- mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
- if (inst_size == 0)
- m_opcode.Clear();
- else {
- m_opcode.SetOpcodeBytes(opcode_data, inst_size);
- m_is_valid = true;
+ uint64_t inst_size = 0;
+ m_is_valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len,
+ pc, inst, inst_size);
+ m_opcode.Clear();
+ if (inst_size != 0) {
+ if (arch.GetTriple().isRISCV())
+ m_opcode.SetOpcode16_32TupleBytes(opcode_data, inst_size,
+ byte_order);
+ else
+ m_opcode.SetOpcodeBytes(opcode_data, inst_size);
}
}
}
@@ -604,10 +614,11 @@ class InstructionLLVMC : public lldb_private::Instruction {
const uint8_t *opcode_data = data.GetDataStart();
const size_t opcode_data_len = data.GetByteSize();
llvm::MCInst inst;
- size_t inst_size =
- mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
+ uint64_t inst_size = 0;
+ bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc,
+ inst, inst_size);
- if (inst_size > 0) {
+ if (valid && inst_size > 0) {
mc_disasm_ptr->SetStyle(use_hex_immediates, hex_style);
const bool saved_use_color = mc_disasm_ptr->GetUseColor();
@@ -1206,9 +1217,10 @@ class InstructionLLVMC : public lldb_private::Instruction {
const uint8_t *opcode_data = data.GetDataStart();
const size_t opcode_data_len = data.GetByteSize();
llvm::MCInst inst;
- const size_t inst_size =
- mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len, pc, inst);
- if (inst_size == 0)
+ uint64_t inst_size = 0;
+ const bool valid = mc_disasm_ptr->GetMCInst(opcode_data, opcode_data_len,
+ pc, inst, inst_size);
+ if (!valid)
return;
m_has_visited_instruction = true;
@@ -1337,19 +1349,19 @@ DisassemblerLLVMC::MCDisasmInstance::MCDisasmInstance(
m_asm_info_up && m_context_up && m_disasm_up && m_instr_printer_up);
}
-uint64_t DisassemblerLLVMC::MCDisasmInstance::GetMCInst(
- const uint8_t *opcode_data, size_t opcode_data_len, lldb::addr_t pc,
- llvm::MCInst &mc_inst) const {
+bool DisassemblerLLVMC::MCDisasmInstance::GetMCInst(const uint8_t *opcode_data,
+ size_t opcode_data_len,
+ lldb::addr_t pc,
+ llvm::MCInst &mc_inst,
+ uint64_t &size) const {
llvm::ArrayRef<uint8_t> data(opcode_data, opcode_data_len);
llvm::MCDisassembler::DecodeStatus status;
- uint64_t new_inst_size;
- status = m_disasm_up->getInstruction(mc_inst, new_inst_size, data, pc,
- llvm::nulls());
+ status = m_disasm_up->getInstruction(mc_inst, size, data, pc, llvm::nulls());
if (status == llvm::MCDisassembler::Success)
- return new_inst_size;
+ return true;
else
- return 0;
+ return false;
}
void DisassemblerLLVMC::MCDisasmInstance::PrintMCInst(
diff --git a/lldb/source/Utility/ArchSpec.cpp b/lldb/source/Utility/ArchSpec.cpp
index 70b9800f4dade..7c71aaae6bcf2 100644
--- a/lldb/source/Utility/ArchSpec.cpp
+++ b/lldb/source/Utility/ArchSpec.cpp
@@ -228,9 +228,9 @@ static const CoreDefinition g_core_definitions[] = {
{eByteOrderLittle, 4, 4, 4, llvm::Triple::hexagon,
ArchSpec::eCore_hexagon_hexagonv5, "hexagonv5"},
- {eByteOrderLittle, 4, 2, 4, llvm::Triple::riscv32, ArchSpec::eCore_riscv32,
+ {eByteOrderLittle, 4, 2, 8, llvm::Triple::riscv32, ArchSpec::eCore_riscv32,
"riscv32"},
- {eByteOrderLittle, 8, 2, 4, llvm::Triple::riscv64, ArchSpec::eCore_riscv64,
+ {eByteOrderLittle, 8, 2, 8, llvm::Triple::riscv64, ArchSpec::eCore_riscv64,
"riscv64"},
{eByteOrderLittle, 4, 4, 4, llvm::Triple::loongarch32,
diff --git a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
index 8ec5d62a99ac5..51487034d89cd 100644
--- a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
+++ b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
@@ -14,6 +14,7 @@
#include "lldb/Core/Disassembler.h"
#include "lldb/Target/ExecutionContext.h"
#include "lldb/Utility/ArchSpec.h"
+#include "lldb/Utility/StreamString.h"
#include "Plugins/Disassembler/LLVMC/DisassemblerLLVMC.h"
@@ -90,3 +91,48 @@ TEST_F(TestMCDisasmInstanceRISCV, TestRISCV32Instruction) {
EXPECT_FALSE(inst_sp->IsCall());
EXPECT_TRUE(inst_sp->DoesBranch());
}
+
+TEST_F(TestMCDisasmInstanceRISCV, TestOpcodeBytePrinter) {
+ ArchSpec arch("riscv32-*-linux");
+
+ const unsigned num_of_instructions = 7;
+ uint8_t data[] = {
+ 0x41, 0x11, // addi sp, sp, -0x10
+ 0x06, 0xc6, // sw ra, 0xc(sp)
+ 0x23, 0x2a, 0xa4, 0xfe, // sw a0, -0xc(s0)
+ 0x23, 0x28, 0xa4, 0xfe, // sw a0, -0x10(s0)
+ 0x22, 0x44, // lw s0, 0x8(sp)
+ 0x20, 0x00, 0x20, 0x09, 0x40, 0x00, 0x3F, // 64 bit custom instruction
+ 0x10, 0x00, 0x00, 0x00, 0x02, 0x1f // 48 bit xqci.e.li rd=8 imm=0x1000
+ };
+
+ const char *expected_outputs[] = {"1141", "c606",
+ "fea42a23", "fea42823",
+ "4422", "0940003f 00200020",
+ "021f 0000 1000"};
+
+ EXPECT_EQ(num_of_instructions, sizeof(expected_outputs) / sizeof(char *));
+
+ DisassemblerSP disass_sp;
+ Address start_addr(0x100);
+ disass_sp = Disassembler::DisassembleBytes(
+ arch, nullptr, nullptr, nullptr, nullptr, start_addr, &data, sizeof(data),
+ num_of_instructions, false);
+
+ // If we failed to get a disassembler, we can assume it is because
+ // the llvm we linked against was not built with the riscv target,
+ // and we should skip these tests without marking anything as failing.
+ if (!disass_sp)
+ return;
+
+ const InstructionList inst_list(disass_sp->GetInstructionList());
+ EXPECT_EQ(num_of_instructions, inst_list.GetSize());
+
+ for (size_t i = 0; i < sizeof(expected_outputs) / sizeof(char *); i++) {
+ InstructionSP inst_sp;
+ StreamString s;
+ inst_sp = inst_list.GetInstructionAtIndex(i);
+ inst_sp->GetOpcode().Dump(&s, 1);
+ ASSERT_STREQ(s.GetString().str().c_str(), expected_outputs[i]);
+ }
+}
>From 8880121940ded4b535284911c23fa387aabd16d1 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:46:55 -0700
Subject: [PATCH 2/4] un-clang-formatify
---
.../RISCV/TestMCDisasmInstanceRISCV.cpp | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
index 51487034d89cd..de4de32145e09 100644
--- a/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
+++ b/lldb/unittests/Disassembler/RISCV/TestMCDisasmInstanceRISCV.cpp
@@ -106,10 +106,17 @@ TEST_F(TestMCDisasmInstanceRISCV, TestOpcodeBytePrinter) {
0x10, 0x00, 0x00, 0x00, 0x02, 0x1f // 48 bit xqci.e.li rd=8 imm=0x1000
};
- const char *expected_outputs[] = {"1141", "c606",
- "fea42a23", "fea42823",
- "4422", "0940003f 00200020",
- "021f 0000 1000"};
+ // clang-format off
+ const char *expected_outputs[] = {
+ "1141",
+ "c606",
+ "fea42a23",
+ "fea42823",
+ "4422",
+ "0940003f 00200020",
+ "021f 0000 1000"
+ };
+ // clang-format on
EXPECT_EQ(num_of_instructions, sizeof(expected_outputs) / sizeof(char *));
>From 5825ddfe556f424a884d792787ca8ad13d3aa7dd Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 22:50:38 -0700
Subject: [PATCH 3/4] fix typeo
---
lldb/source/Core/Opcode.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 15ae87ea99f9f..34319dc5b0a93 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -46,7 +46,7 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
// to reverse the bytes manually.
if (GetEndianSwap())
for (int i = 0; i < m_data.inst.length; i++)
- buf[m_data.inst.length - i] = m_data.inst.bytes[i];
+ buf[m_data.inst.length - i - 1] = m_data.inst.bytes[i];
else
memcpy(buf, m_data.inst.bytes, m_data.inst.length);
uint32_t i = 0;
>From 6a775cbac6a0246a3514e0a2cde700e7363461f7 Mon Sep 17 00:00:00 2001
From: Jason Molenda <jmolenda at apple.com>
Date: Thu, 10 Jul 2025 23:22:18 -0700
Subject: [PATCH 4/4] clarify dump method a bit more.
---
lldb/source/Core/Opcode.cpp | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/lldb/source/Core/Opcode.cpp b/lldb/source/Core/Opcode.cpp
index 34319dc5b0a93..d5cf12245a8dc 100644
--- a/lldb/source/Core/Opcode.cpp
+++ b/lldb/source/Core/Opcode.cpp
@@ -49,19 +49,20 @@ int Opcode::Dump(Stream *s, uint32_t min_byte_width) const {
buf[m_data.inst.length - i - 1] = m_data.inst.bytes[i];
else
memcpy(buf, m_data.inst.bytes, m_data.inst.length);
+
+ const bool format_as_words = !(m_data.inst.length % 4);
uint32_t i = 0;
while (i < m_data.inst.length) {
if (i > 0)
s->PutChar(' ');
- // if size % 4 == 0, print as 1 or 2 32 bit values (32 or 64 bit inst)
- if (!(m_data.inst.length % 4)) {
+ if (format_as_words) {
+ // Format as words; print 1 or more UInt32 values.
uint32_t value;
memcpy(&value, &buf[i], 4);
s->Printf("%8.8x", value);
i += 4;
- // else if size % 2 == 0, print as 1 or 3 16 bit values (16 or 48 bit
- // inst)
- } else if (!(m_data.inst.length % 2)) {
+ } else {
+ // Format as halfwords; print 1 or more UInt16 values.
uint16_t value;
memcpy(&value, &buf[i], 2);
s->Printf("%4.4x", value);
More information about the lldb-commits
mailing list