[llvm] [llvm-mc] Add --hex to disassemble hex pairs (PR #119992)

Fangrui Song via llvm-commits llvm-commits at lists.llvm.org
Sat Dec 14 17:32:02 PST 2024


https://github.com/MaskRay created https://github.com/llvm/llvm-project/pull/119992

`--disassemble`/`--cdis` parse input bytes as decimal, binary (`0b` prefix),
octal (`0o` prefix), or hexadecimal (`0x` prefix) integers. While the
hexadecimal form is the most commonly used, requiring a `0x` prefix for each
byte (`0x48 0x29 0xc3`) is cumbersome.

This patch adds `--hex`, which makes the disassembler read its input as
unprefixed pairs of hexadecimal digits (one pair per byte), as many other
disassemblers do, e.g.

```
% rz-asm -a x86 -b 64 -d 4829c34829c4   # rizin
sub rbx, rax
sub rsp, rax

% echo 4829c34829c4 | llvm-mc -triple=x86_64 --cdis --hex --output-asm-variant=1
        .text
        sub     rbx, rax
        sub     rsp, rax
```


From f534b187dfb751dc7dadb26ef1d9d412dd1d55c9 Mon Sep 17 00:00:00 2001
From: Fangrui Song <i at maskray.me>
Date: Sat, 14 Dec 2024 17:31:49 -0800
Subject: [PATCH] [𝘀𝗽𝗿] initial version
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Created using spr 1.3.5-bogner
---
 llvm/docs/CommandGuide/llvm-mc.rst          |  5 ++
 llvm/test/MC/Disassembler/X86/hex-pairs.txt | 58 +++++++++++++++++++++
 llvm/tools/llvm-mc/Disassembler.cpp         | 33 ++++++++----
 llvm/tools/llvm-mc/Disassembler.h           |  2 +-
 llvm/tools/llvm-mc/llvm-mc.cpp              |  6 ++-
 5 files changed, 93 insertions(+), 11 deletions(-)
 create mode 100644 llvm/test/MC/Disassembler/X86/hex-pairs.txt

diff --git a/llvm/docs/CommandGuide/llvm-mc.rst b/llvm/docs/CommandGuide/llvm-mc.rst
index c5d2f9396dce71..ba568da6f9aeb6 100644
--- a/llvm/docs/CommandGuide/llvm-mc.rst
+++ b/llvm/docs/CommandGuide/llvm-mc.rst
@@ -92,6 +92,11 @@ End-user Options
 
  Generate DWARF debugging info for assembly source files.
 
+.. option:: --hex
+
+ Take hex pairs as input for the disassembler.
+ Whitespace is ignored.
+
 .. option:: --large-code-model
 
  Create CFI directives that assume the code might be more than 2 GB.
diff --git a/llvm/test/MC/Disassembler/X86/hex-pairs.txt b/llvm/test/MC/Disassembler/X86/hex-pairs.txt
new file mode 100644
index 00000000000000..7c759a1853b96b
--- /dev/null
+++ b/llvm/test/MC/Disassembler/X86/hex-pairs.txt
@@ -0,0 +1,58 @@
+# RUN: rm -rf %t && split-file %s %t && cd %t
+# RUN: llvm-mc -triple=x86_64 --disassemble --hex a.s | FileCheck %s
+# RUN: llvm-mc -triple=x86_64 --disassemble --hex decode1.s 2>&1 | FileCheck %s --check-prefix=DECODE1 --implicit-check-not=warning:
+# RUN: not llvm-mc -triple=x86_64 --disassemble --hex decode2.s 2>&1 | FileCheck %s --check-prefix=DECODE2 --implicit-check-not=warning:
+# RUN: not llvm-mc -triple=x86_64 --disassemble --hex err1.s 2>&1 | FileCheck %s --check-prefix=ERR1 --implicit-check-not=error:
+# RUN: not llvm-mc -triple=x86_64 --disassemble --hex err2.s 2>&1 | FileCheck %s --check-prefix=ERR2 --implicit-check-not=error:
+
+#--- a.s
+4883ec08 31  # comment
+# comment
+	ed4829 c390
+[c3c3][4829c3]
+[90]
+
+# CHECK:      subq $8, %rsp
+# CHECK-NEXT: xorl %ebp, %ebp
+# CHECK-NEXT: subq %rax, %rbx
+# CHECK-NEXT: nop
+# CHECK-NEXT: retq
+# CHECK-NEXT: retq
+# CHECK-NEXT: subq %rax, %rbx
+# CHECK-NEXT: nop
+# CHECK-EMPTY:
+
+#--- decode1.s
+4889
+
+# DECODE1: 1:1: warning: invalid instruction encoding
+
+#--- decode2.s
+[4889][4889] [4889]4889c3
+	[4889]
+
+# DECODE2: 1:2: warning: invalid instruction encoding
+# DECODE2: 1:8: warning: invalid instruction encoding
+# DECODE2: 1:15: warning: invalid instruction encoding
+# DECODE2: 2:3: warning: invalid instruction encoding
+
+#--- err1.s
+0x31ed
+0xcc
+
+# ERR1:      1:1: error: invalid input token
+# ERR1:      2:1: error: invalid input token
+# ERR1:      xorl %ebp, %ebp
+# ERR1-NEXT: int3
+# ERR1-EMPTY:
+
+#--- err2.s
+90c
+cc
+c
+
+# ERR2:      1:3: error: expected two hex digits
+# ERR2:      3:1: error: expected two hex digits
+# ERR2:      nop
+# ERR2-NEXT: int3
+# ERR2-EMPTY:
diff --git a/llvm/tools/llvm-mc/Disassembler.cpp b/llvm/tools/llvm-mc/Disassembler.cpp
index a588058437ec9a..f96ccd17f1b6c5 100644
--- a/llvm/tools/llvm-mc/Disassembler.cpp
+++ b/llvm/tools/llvm-mc/Disassembler.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Disassembler.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
@@ -94,10 +95,8 @@ static bool SkipToToken(StringRef &Str) {
   }
 }
 
-
-static bool ByteArrayFromString(ByteArrayTy &ByteArray,
-                                StringRef &Str,
-                                SourceMgr &SM) {
+static bool byteArrayFromString(ByteArrayTy &ByteArray, StringRef &Str,
+                                SourceMgr &SM, bool HexPairs) {
   while (SkipToToken(Str)) {
     // Handled by higher level
     if (Str[0] == '[' || Str[0] == ']')
@@ -109,7 +108,24 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
 
     // Convert to a byte and add to the byte vector.
     unsigned ByteVal;
-    if (Value.getAsInteger(0, ByteVal) || ByteVal > 255) {
+    if (HexPairs) {
+      if (Next < 2) {
+        SM.PrintMessage(SMLoc::getFromPointer(Value.data()),
+                        SourceMgr::DK_Error, "expected two hex digits");
+        Str = Str.substr(Next);
+        return true;
+      }
+      Next = 2;
+      unsigned C0 = hexDigitValue(Value[0]);
+      unsigned C1 = hexDigitValue(Value[1]);
+      if (C0 == -1u || C1 == -1u) {
+        SM.PrintMessage(SMLoc::getFromPointer(Value.data()),
+                        SourceMgr::DK_Error, "invalid input token");
+        Str = Str.substr(Next);
+        return true;
+      }
+      ByteVal = C0 * 16 + C1;
+    } else if (Value.getAsInteger(0, ByteVal) || ByteVal > 255) {
       // If we have an error, print it and skip to the end of line.
       SM.PrintMessage(SMLoc::getFromPointer(Value.data()), SourceMgr::DK_Error,
                       "invalid input token");
@@ -130,9 +146,8 @@ static bool ByteArrayFromString(ByteArrayTy &ByteArray,
 int Disassembler::disassemble(const Target &T, const std::string &Triple,
                               MCSubtargetInfo &STI, MCStreamer &Streamer,
                               MemoryBuffer &Buffer, SourceMgr &SM,
-                              MCContext &Ctx,
-                              const MCTargetOptions &MCOptions) {
-
+                              MCContext &Ctx, const MCTargetOptions &MCOptions,
+                              bool HexPairs) {
   std::unique_ptr<const MCRegisterInfo> MRI(T.createMCRegInfo(Triple));
   if (!MRI) {
     errs() << "error: no register info for target " << Triple << "\n";
@@ -188,7 +203,7 @@ int Disassembler::disassemble(const Target &T, const std::string &Triple,
     }
 
     // It's a real token, get the bytes and emit them
-    ErrorOccurred |= ByteArrayFromString(ByteArray, Str, SM);
+    ErrorOccurred |= byteArrayFromString(ByteArray, Str, SM, HexPairs);
 
     if (!ByteArray.first.empty())
       ErrorOccurred |=
diff --git a/llvm/tools/llvm-mc/Disassembler.h b/llvm/tools/llvm-mc/Disassembler.h
index d0226abadc630a..68f32066ccd89c 100644
--- a/llvm/tools/llvm-mc/Disassembler.h
+++ b/llvm/tools/llvm-mc/Disassembler.h
@@ -32,7 +32,7 @@ class Disassembler {
   static int disassemble(const Target &T, const std::string &Triple,
                          MCSubtargetInfo &STI, MCStreamer &Streamer,
                          MemoryBuffer &Buffer, SourceMgr &SM, MCContext &Ctx,
-                         const MCTargetOptions &MCOptions);
+                         const MCTargetOptions &MCOptions, bool HexPairs);
 };
 
 } // namespace llvm
diff --git a/llvm/tools/llvm-mc/llvm-mc.cpp b/llvm/tools/llvm-mc/llvm-mc.cpp
index 898d79b9233b9a..04d94d474df466 100644
--- a/llvm/tools/llvm-mc/llvm-mc.cpp
+++ b/llvm/tools/llvm-mc/llvm-mc.cpp
@@ -94,6 +94,10 @@ static cl::opt<bool>
                 cl::desc("Prefer hex format for immediate values"),
                 cl::cat(MCCategory));
 
+static cl::opt<bool>
+    HexPairs("hex", cl::desc("Take hex pairs as input for the disassembler"),
+             cl::cat(MCCategory));
+
 static cl::list<std::string>
     DefineSymbol("defsym",
                  cl::desc("Defines a symbol to be an integer constant"),
@@ -592,7 +596,7 @@ int main(int argc, char **argv) {
   }
   if (disassemble)
     Res = Disassembler::disassemble(*TheTarget, TripleName, *STI, *Str, *Buffer,
-                                    SrcMgr, Ctx, MCOptions);
+                                    SrcMgr, Ctx, MCOptions, HexPairs);
 
   // Keep output if no errors.
   if (Res == 0) {



More information about the llvm-commits mailing list