[llvm-commits] [llvm] r152506 - in /llvm/trunk/bindings/python/llvm: disassembler.py tests/test_disassembler.py

Gregory Szorc gregory.szorc at gmail.com
Sat Mar 10 13:05:05 PST 2012


Author: gps
Date: Sat Mar 10 15:05:05 2012
New Revision: 152506

URL: http://llvm.org/viewvc/llvm-project?rev=152506&view=rev
Log:
[llvm.py] Implement interface to enhanced disassembler

This requires a C++ change to EDDisassembler's ctor to function properly
(the llvm::InitializeAll* functions aren't being called currently and
there is no way to call them from Python).

Code is partially tested and works well enough for an initial commit. There
are probably many small bugs.

Added:
    llvm/trunk/bindings/python/llvm/disassembler.py
    llvm/trunk/bindings/python/llvm/tests/test_disassembler.py

Added: llvm/trunk/bindings/python/llvm/disassembler.py
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/bindings/python/llvm/disassembler.py?rev=152506&view=auto
==============================================================================
--- llvm/trunk/bindings/python/llvm/disassembler.py (added)
+++ llvm/trunk/bindings/python/llvm/disassembler.py Sat Mar 10 15:05:05 2012
@@ -0,0 +1,564 @@
+#===- disassembler.py - Python LLVM Bindings -----------------*- python -*--===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+#===------------------------------------------------------------------------===#
+
+from abc import ABCMeta
+from abc import abstractmethod
+
+from ctypes import CFUNCTYPE
+from ctypes import POINTER
+from ctypes import byref
+from ctypes import c_char_p
+from ctypes import c_int
+from ctypes import c_ubyte
+from ctypes import c_uint64
+from ctypes import c_uint
+from ctypes import c_void_p
+from ctypes import memmove
+
+from .common import CachedProperty
+from .common import LLVMObject
+from .common import c_object_p
+from .common import get_library
+
+__all__ = [
+    'DisassemblerByteArraySource',
+    'DisassemblerFileSource',
+    'DisassemblerSource',
+    'Disassembler',
+    'Instruction',
+    'Operand',
+    'Token',
+]
+
+callbacks = {}  # ctypes callback prototypes by name; filled in at module end
+
+class DisassemblerSource:
+    """Abstract base class for disassembler input.
+
+    This defines the interface to which inputs to the disassembler must
+    conform.
+
+    Basically, the disassembler input is a read-only byte sequence of
+    finite length.
+    """
+    __metaclass__ = ABCMeta  # Python 2 metaclass hook; ignored by Python 3
+
+    @abstractmethod
+    def __len__(self):
+        """Returns the number of bytes that are available for input."""
+        pass
+
+    @abstractmethod
+    def get_byte(self, address):
+        """Returns the byte at the specified address."""
+        pass
+
+    @abstractmethod
+    def start_address(self):
+        """Returns the address at which to start fetching bytes, as a long."""
+        pass
+
+class DisassemblerByteArraySource(DisassemblerSource):
+    """A disassembler source for in-memory byte arrays (addresses from 0)."""
+
+    def __init__(self, b):
+        self._array = b  # stored as given; no copy is made here
+
+    def __len__(self):
+        return len(self._array)
+
+    def get_byte(self, address):
+        return self._array[address]  # address is used directly as the index
+
+    def start_address(self):
+        return 0
+
+class DisassemblerFileSource(DisassemblerSource):
+    """A disassembler source for file segments.
+
+    This allows you to feed in segments of a file into a Disassembler. The
+    segment is read into memory once, when the source is constructed.
+    """
+
+    def __init__(self, filename, start_offset, length=None, end_offset=None,
+                 start_address=None):
+        """Create a new source from a file.
+
+        A source begins at byte offset start_offset and is sized either by
+        length or by end_offset (exclusive); length wins if both are given.
+        start_address is the address of the first byte and defaults to 0.
+        Raises ValueError when neither length nor end_offset is given.
+        """
+        if length is None and end_offset is None:
+            raise ValueError('One of length or end_offset must be defined.')
+
+        self._start_address = 0 if start_address is None else start_address
+        count = end_offset - start_offset if length is None else length
+
+        with open(filename, 'rb') as fh:
+            fh.seek(start_offset)
+            # FIXME handle case where read bytes != requested
+            self._buf = fh.read(count)
+
+    def __len__(self):
+        return len(self._buf)  # bytes actually read, which may be < requested
+
+    def get_byte(self, address):
+        offset = address - self._start_address
+        if offset < 0:  # a negative index would wrap to the end of the buffer
+            raise IndexError('Address %d is below start address.' % address)
+        return self._buf[offset]
+
+    def start_address(self):
+        return self._start_address
+
+class Disassembler(LLVMObject):
+    """Interface to LLVM's enhanced disassembler.
+
+    The API is slightly different from the C API in that we tightly couple a
+    disassembler instance to an input source. This saves an extra level of
+    abstraction and makes the Python implementation easier.
+    """
+
+    SYNTAX_X86_INTEL = 0
+    SYNTAX_X86_ATT = 1
+    SYNTAX_ARM_UAL = 2
+
+    def __init__(self, triple, source, syntax=0):
+        """Create a new disassembler instance.
+
+        Arguments:
+
+        triple -- str target type (e.g. x86_64-apple-darwin10)
+        source -- DisassemblerSource instance to be fed into this disassembler.
+        syntax -- The assembly syntax to use. One of the SYNTAX_* class
+            constants. e.g. Disassembler.SYNTAX_X86_INTEL
+        """
+        assert isinstance(source, DisassemblerSource)
+
+        ptr = c_object_p()
+        result = lib.EDGetDisassembler(byref(ptr), c_char_p(triple),
+                                       c_int(syntax))
+        if result != 0:
+            raise Exception('Non-0 return code.')
+
+        LLVMObject.__init__(self, ptr)
+
+        self._source = source
+
+    def get_instructions(self):
+        """Obtain the instructions from the input.
+
+        This is a generator for Instruction instances.
+
+        By default, this will return instructions for the entire source which
+        has been defined: decoding starts at the source's start_address() and
+        continues until len(source) bytes are consumed. An Exception is raised
+        if the library fails to produce an instruction at some address.
+        """
+        # We currently obtain 1 instruction at a time because it is easiest.
+
+        # This serves as our EDByteReaderCallback. It is a proxy between C and
+        # the Python DisassemblerSource.
+        def byte_reader(dest, address, arg):
+            try:
+                byte = self._source.get_byte(address)
+                # Sources may return a 1-char string or an int (bytearray);
+                # memmove() would treat an int as a source *pointer*.
+                if isinstance(byte, (bytes, bytearray)):
+                    byte = ord(byte)
+                dest[0] = byte
+                return 0
+            except:  # Exceptions cannot propagate through the C caller.
+                return -1
+
+        address = self._source.start_address()
+        end_address = address + len(self._source)
+        cb = callbacks['byte_reader'](byte_reader)
+        while address < end_address:
+            ptr = c_object_p()
+            result = lib.EDCreateInsts(byref(ptr), c_uint(1), self, cb,
+                                       address, c_void_p(None))
+            if result != 1:
+                raise Exception('Error obtaining instruction at address %d' %
+                        address)
+
+            instruction = Instruction(ptr, self)
+            yield instruction
+
+            address += instruction.byte_size
+
+
+class Instruction(LLVMObject):
+    """A single decoded instruction.
+
+    Instances are produced by Disassembler.get_instructions().
+    """
+    def __init__(self, ptr, disassembler):
+        """Wrap a native instruction handle.
+
+        Meant for use inside this module only. ptr is the c_object_p filled
+        in by the library; disassembler is the Disassembler that produced it.
+        """
+        assert isinstance(ptr, c_object_p)
+        assert isinstance(disassembler, Disassembler)
+
+        LLVMObject.__init__(self, ptr, disposer=lib.EDReleaseInst)
+        self._disassembler = disassembler
+
+    def __str__(self):
+        """Return the string EDGetInstString produces for this instruction."""
+        text = c_char_p(None)
+        status = lib.EDGetInstString(byref(text), self)
+        if status != 0:
+            raise Exception('Non-0 return code.')
+        return text.value
+
+    @CachedProperty
+    def byte_size(self):
+        """Value returned by EDInstByteSize; a return of -1 raises."""
+        size = lib.EDInstByteSize(self)
+        if size == -1:
+            raise Exception('Error code returned.')
+        return size
+
+    @CachedProperty
+    def id(self):
+        """Identifier written by EDInstID into its output argument."""
+        ident = c_uint()
+        status = lib.EDInstID(byref(ident), self)
+        if status != 0:
+            raise Exception('Non-0 return code.')
+        return ident.value
+
+    @CachedProperty
+    def is_branch(self):
+        """True when EDInstIsBranch returns a positive value; -1 raises."""
+        flag = lib.EDInstIsBranch(self)
+        if flag == -1:
+            raise Exception('Error code returned.')
+        return flag > 0
+
+    @CachedProperty
+    def is_move(self):
+        """True when EDInstIsMove returns a positive value; -1 raises."""
+        flag = lib.EDInstIsMove(self)
+        if flag == -1:
+            raise Exception('Error code returned.')
+        return flag > 0
+
+    @CachedProperty
+    def branch_target_id(self):
+        """Value returned by EDBranchTargetID; a return of -1 raises."""
+        target = lib.EDBranchTargetID(self)
+        if target == -1:
+            raise Exception('Error code returned.')
+        return target
+
+    @CachedProperty
+    def move_source_id(self):
+        """Value returned by EDMoveSourceID; a return of -1 raises."""
+        origin = lib.EDMoveSourceID(self)
+        if origin == -1:
+            raise Exception('Error code returned.')
+        return origin
+
+    def get_tokens(self):
+        """Yield a Token for each token of this instruction.
+
+        The count comes from EDNumTokens; each handle from EDGetToken.
+        """
+        total = lib.EDNumTokens(self)
+        if total == -1:
+            raise Exception('Error code returned.')
+
+        for index in range(total):
+            handle = c_object_p()
+            status = lib.EDGetToken(byref(handle), self, c_int(index))
+            if status != 0:
+                raise Exception('Non-0 return code.')
+
+            yield Token(handle, self)
+
+    def get_operands(self):
+        """Yield an Operand for each operand of this instruction.
+
+        The count comes from EDNumOperands; each handle from EDGetOperand.
+        """
+        total = lib.EDNumOperands(self)
+        if total == -1:
+            raise Exception('Error code returned.')
+
+        for index in range(total):
+            handle = c_object_p()
+            status = lib.EDGetOperand(byref(handle), self, c_int(index))
+            if status != 0:
+                raise Exception('Non-0 return code.')
+
+            yield Operand(handle, self)
+
+class Token(LLVMObject):
+    """A token of an instruction, obtained from Instruction.get_tokens()."""
+    def __init__(self, ptr, instruction):
+        assert isinstance(ptr, c_object_p)
+        assert isinstance(instruction, Instruction)
+
+        LLVMObject.__init__(self, ptr)
+        self._instruction = instruction
+
+    def __str__(self):
+        """Return the string EDGetTokenString produces for this token."""
+        s = c_char_p(None)
+        result = lib.EDGetTokenString(byref(s), self)
+        if result != 0:
+            raise Exception('Non-0 return code.')
+        return s.value
+
+    @CachedProperty
+    def operand_index(self):
+        """Value returned by EDOperandIndexForToken; -1 raises."""
+        result = lib.EDOperandIndexForToken(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result
+
+    @CachedProperty
+    def is_whitespace(self):
+        """True when EDTokenIsWhitespace returns a positive value."""
+        result = lib.EDTokenIsWhitespace(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_punctuation(self):
+        """True when EDTokenIsPunctuation returns a positive value."""
+        result = lib.EDTokenIsPunctuation(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_opcode(self):
+        """True when EDTokenIsOpcode returns a positive value."""
+        result = lib.EDTokenIsOpcode(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_literal(self):
+        """True when EDTokenIsLiteral returns a positive value."""
+        result = lib.EDTokenIsLiteral(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_register(self):
+        """True when EDTokenIsRegister returns a positive value."""
+        result = lib.EDTokenIsRegister(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_negative_literal(self):
+        """True when EDTokenIsNegativeLiteral returns a positive value."""
+        result = lib.EDTokenIsNegativeLiteral(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def absolute_value(self):
+        """Value written by EDLiteralTokenAbsoluteValue, as a Python int."""
+        value = c_uint64()
+        result = lib.EDLiteralTokenAbsoluteValue(byref(value), self)
+        if result != 0:
+            raise Exception('Non-0 return code.')
+        return value.value  # unwrap the ctypes object, as Instruction.id does
+
+    @CachedProperty
+    def register_value(self):
+        """Value written by EDRegisterTokenValue, as a Python int."""
+        value = c_uint()
+        result = lib.EDRegisterTokenValue(byref(value), self)
+        if result != 0:
+            raise Exception('Non-0 return code.')
+        return value.value  # unwrap the ctypes object, as Instruction.id does
+
+class Operand(LLVMObject):
+    """Represents an operand in an instruction (see Instruction.get_operands).
+
+    FIXME support register evaluation.
+    """
+    def __init__(self, ptr, instruction):
+        assert isinstance(ptr, c_object_p)
+        assert isinstance(instruction, Instruction)
+
+        LLVMObject.__init__(self, ptr)
+
+        self._instruction = instruction
+
+    @CachedProperty
+    def is_register(self):
+        """True when EDOperandIsRegister returns a positive value."""
+        result = lib.EDOperandIsRegister(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_immediate(self):
+        """True when EDOperandIsImmediate returns a positive value."""
+        result = lib.EDOperandIsImmediate(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def is_memory(self):
+        """True when EDOperandIsMemory returns a positive value."""
+        result = lib.EDOperandIsMemory(self)
+        if result == -1:
+            raise Exception('Error code returned.')
+        return result > 0
+
+    @CachedProperty
+    def register_value(self):
+        """Value written by EDRegisterOperandValue, as a Python int."""
+        value = c_uint()
+        result = lib.EDRegisterOperandValue(byref(value), self)
+        if result != 0:
+            raise Exception('Non-0 return code.')
+        return value.value  # unwrap the ctypes object for callers
+
+    @CachedProperty
+    def immediate_value(self):
+        """Value written by EDImmediateOperandValue, as a Python int."""
+        value = c_uint64()
+        result = lib.EDImmediateOperandValue(byref(value), self)
+        if result != 0:
+            raise Exception('Non-0 return code.')
+        return value.value  # unwrap the ctypes object for callers
+
+def register_library(library):
+    """Declare ctypes argument and return types for the ED* C functions."""
+    library.EDGetDisassembler.argtypes = [POINTER(c_object_p), c_char_p, c_int]
+    library.EDGetDisassembler.restype = c_int
+
+    library.EDGetRegisterName.argtypes = [POINTER(c_char_p), Disassembler,
+            c_uint]
+    library.EDGetRegisterName.restype = c_int
+
+    library.EDRegisterIsStackPointer.argtypes = [Disassembler, c_uint]
+    library.EDRegisterIsStackPointer.restype = c_int
+    library.EDRegisterIsProgramCounter.argtypes = [Disassembler, c_uint]
+    library.EDRegisterIsProgramCounter.restype = c_int
+
+    library.EDCreateInsts.argtypes = [POINTER(c_object_p), c_uint,
+            Disassembler, callbacks['byte_reader'], c_uint64, c_void_p]
+    library.EDCreateInsts.restype = c_uint
+
+    library.EDReleaseInst.argtypes = [Instruction]
+    library.EDReleaseInst.restype = None  # void in C; ctypes defaults to int
+
+    library.EDInstByteSize.argtypes = [Instruction]
+    library.EDInstByteSize.restype = c_int
+
+    library.EDGetInstString.argtypes = [POINTER(c_char_p), Instruction]
+    library.EDGetInstString.restype = c_int
+
+    library.EDInstID.argtypes = [POINTER(c_uint), Instruction]
+    library.EDInstID.restype = c_int
+
+    library.EDInstIsBranch.argtypes = [Instruction]
+    library.EDInstIsBranch.restype = c_int
+    library.EDInstIsMove.argtypes = [Instruction]
+    library.EDInstIsMove.restype = c_int
+
+    library.EDBranchTargetID.argtypes = [Instruction]
+    library.EDBranchTargetID.restype = c_int
+
+    library.EDMoveSourceID.argtypes = [Instruction]
+    library.EDMoveSourceID.restype = c_int
+
+    library.EDMoveTargetID.argtypes = [Instruction]
+    library.EDMoveTargetID.restype = c_int
+
+    library.EDNumTokens.argtypes = [Instruction]
+    library.EDNumTokens.restype = c_int
+
+    library.EDGetToken.argtypes = [POINTER(c_object_p), Instruction, c_int]
+    library.EDGetToken.restype = c_int
+
+    library.EDGetTokenString.argtypes = [POINTER(c_char_p), Token]
+    library.EDGetTokenString.restype = c_int
+
+    library.EDOperandIndexForToken.argtypes = [Token]
+    library.EDOperandIndexForToken.restype = c_int
+
+    library.EDTokenIsWhitespace.argtypes = [Token]
+    library.EDTokenIsWhitespace.restype = c_int
+
+    library.EDTokenIsPunctuation.argtypes = [Token]
+    library.EDTokenIsPunctuation.restype = c_int
+
+    library.EDTokenIsOpcode.argtypes = [Token]
+    library.EDTokenIsOpcode.restype = c_int
+
+    library.EDTokenIsLiteral.argtypes = [Token]
+    library.EDTokenIsLiteral.restype = c_int
+
+    library.EDTokenIsRegister.argtypes = [Token]
+    library.EDTokenIsRegister.restype = c_int
+
+    library.EDTokenIsNegativeLiteral.argtypes = [Token]
+    library.EDTokenIsNegativeLiteral.restype = c_int
+
+    library.EDLiteralTokenAbsoluteValue.argtypes = [POINTER(c_uint64), Token]
+    library.EDLiteralTokenAbsoluteValue.restype = c_int
+
+    library.EDRegisterTokenValue.argtypes = [POINTER(c_uint), Token]
+    library.EDRegisterTokenValue.restype = c_int
+
+    library.EDNumOperands.argtypes = [Instruction]
+    library.EDNumOperands.restype = c_int
+
+    library.EDGetOperand.argtypes = [POINTER(c_object_p), Instruction, c_int]
+    library.EDGetOperand.restype = c_int
+
+    library.EDOperandIsRegister.argtypes = [Operand]
+    library.EDOperandIsRegister.restype = c_int
+
+    library.EDOperandIsImmediate.argtypes = [Operand]
+    library.EDOperandIsImmediate.restype = c_int
+
+    library.EDOperandIsMemory.argtypes = [Operand]
+    library.EDOperandIsMemory.restype = c_int
+
+    library.EDRegisterOperandValue.argtypes = [POINTER(c_uint), Operand]
+    library.EDRegisterOperandValue.restype = c_int
+
+    library.EDImmediateOperandValue.argtypes = [POINTER(c_uint64), Operand]
+    library.EDImmediateOperandValue.restype = c_int
+
+    library.EDEvaluateOperand.argtypes = [POINTER(c_uint64), Operand,
+        callbacks['register_reader'], c_void_p]  # arg 0 is an out-pointer
+    library.EDEvaluateOperand.restype = c_int
+
+# Enhanced disassembler callback prototypes, consumed by register_library().
+callbacks['byte_reader'] = CFUNCTYPE(c_int, POINTER(c_ubyte), c_uint64,
+                                     c_void_p)
+callbacks['register_reader'] = CFUNCTYPE(c_int, POINTER(c_uint64), c_uint,
+                                         c_void_p)
+
+lib = get_library()  # module-level handle used by the classes above
+register_library(lib)

Added: llvm/trunk/bindings/python/llvm/tests/test_disassembler.py
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/bindings/python/llvm/tests/test_disassembler.py?rev=152506&view=auto
==============================================================================
--- llvm/trunk/bindings/python/llvm/tests/test_disassembler.py (added)
+++ llvm/trunk/bindings/python/llvm/tests/test_disassembler.py Sat Mar 10 15:05:05 2012
@@ -0,0 +1,62 @@
+from unittest import expectedFailure
+from unittest import skip
+
+from .base import TestBase
+from ..disassembler import DisassemblerByteArraySource
+from ..disassembler import DisassemblerFileSource
+from ..disassembler import Disassembler
+from ..object import ObjectFile
+
+class TestDisassembler(TestBase):
+    def test_simple(self):
+        """Disassemble one 3-byte jcxz and check instruction and token data."""
+        sequence = '\x67\xe3\x81' # jcxz -127
+        triple = 'i686-apple-darwin9'
+        source = DisassemblerByteArraySource(sequence)
+
+        disassembler = Disassembler(triple, source)
+        instructions = list(disassembler.get_instructions())
+
+        self.assertEqual(len(instructions), 1)
+
+        i = instructions[0]
+        self.assertEqual(str(i), '\tjcxz\t-127\n')
+        self.assertEqual(i.byte_size, 3)
+        self.assertEqual(i.id, 1032)
+        self.assertTrue(i.is_branch)
+        self.assertFalse(i.is_move)
+        self.assertEqual(i.branch_target_id, 0)
+
+        tokens = list(i.get_tokens())
+        self.assertEqual(len(tokens), 4)
+        token = tokens[0]
+        self.assertEqual(str(token), 'jcxz')
+        self.assertFalse(token.is_whitespace)
+        self.assertFalse(token.is_punctuation)
+        self.assertTrue(token.is_opcode)
+        self.assertFalse(token.is_literal)
+        self.assertFalse(token.is_register)
+
+        self.assertTrue(tokens[1].is_whitespace)
+
+        operands = list(i.get_operands())
+        self.assertEqual(len(operands), 1)
+
+        # TODO implement operand tests
+
+    @skip('This test is horribly broken and probably not even correct.')
+    def test_read_instructions(self):
+        filename = self.get_test_binary()
+        o = ObjectFile(filename=filename)
+
+        for symbol in o.get_symbols():
+            address = symbol.address
+            offset = symbol.file_offset
+            size = symbol.size
+
+            source = DisassemblerFileSource(filename, offset, length=size,
+                                            start_address=address)
+
+            disassembler = Disassembler('x86-generic-gnu-linux', source)
+            for instruction in disassembler.get_instructions():
+                print instruction  # Python 2 print statement





More information about the llvm-commits mailing list