fix ints_to_bytes performance (#1761)

* fix ints_to_bytes performance
2025-12-12 15:49:46 -08:00 · 2023-08-24 17:01:41 -06:00
parent bd2f7bc1f4
commit 448b122ef0
4 changed files with 28 additions and 46 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -28,6 +28,7 @@
 - ELF: fix parsing of symtab #1704 @williballenthin
 - result document: don't use deprecated pydantic functions #1718 @williballenthin
 - pytest: don't mark IDA tests as pytest tests #1719 @williballenthin
+- ghidra: fix ints_to_bytes performance #1761 @mike-hunhoff

 ### capa explorer IDA Pro plugin
 - fix unhandled exception when resolving rule path #1693 @mike-hunhoff
--- a/capa/features/extractors/ghidra/file.py
+++ b/capa/features/extractors/ghidra/file.py
@@ -22,7 +22,7 @@ from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, Absolu
 MAX_OFFSET_PE_AFTER_MZ = 0x200


-def check_segment_for_pe() -> Iterator[Tuple[int, int]]:
+def find_embedded_pe() -> Iterator[Tuple[int, int]]:
    """check segment for embedded PE

    adapted for Ghidra from:
@@ -39,10 +39,11 @@ def check_segment_for_pe() -> Iterator[Tuple[int, int]]:
    ]

    todo = []
+    start_addr = currentProgram().getMinAddress().add(1)  # type: ignore [name-defined] # noqa: F821
    for mzx, pex, i in mz_xor:
        # find all segment offsets containing XOR'd "MZ" bytes
        off: ghidra.program.model.address.GenericAddress
-        for off in capa.features.extractors.ghidra.helpers.find_byte_sequence(mzx):
+        for off in capa.features.extractors.ghidra.helpers.find_byte_sequence(start_addr, mzx):
            todo.append((off, mzx, pex, i))

    seg_max = currentProgram().getMaxAddress()  # type: ignore [name-defined] # noqa: F821
@@ -73,8 +74,7 @@ def check_segment_for_pe() -> Iterator[Tuple[int, int]]:

 def extract_file_embedded_pe() -> Iterator[Tuple[Feature, Address]]:
    """extract embedded PE features"""
-
-    for ea, _ in check_segment_for_pe():
+    for ea, _ in find_embedded_pe():
        yield Characteristic("embedded pe"), FileOffsetAddress(ea)


--- a/capa/features/extractors/ghidra/helpers.py
+++ b/capa/features/extractors/ghidra/helpers.py
@@ -20,24 +20,25 @@ from capa.features.address import AbsoluteVirtualAddress
 from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle


-def fix_byte(b: int) -> bytes:
-    """Transform signed ints from Java into bytes for Python
+def ints_to_bytes(bytez: List[int]) -> bytes:
+    """convert Java signed ints to Python bytes

    args:
-        b: signed int returned from Java processing
+        bytez: list of Java signed ints
    """
-    return (b & 0xFF).to_bytes(1, "little")
+    return bytes([b & 0xFF for b in bytez])


-def find_byte_sequence(seq: bytes) -> Iterator[int]:
+def find_byte_sequence(addr: ghidra.program.model.address.Address, seq: bytes) -> Iterator[int]:
    """yield all ea of a given byte sequence

    args:
+        addr: start address
        seq: bytes to search e.g. b"\x01\x03"
    """
    seqstr = "".join([f"\\x{b:02x}" for b in seq])
-    # .add(1) to avoid false positives on regular PE files
-    eas = findBytes(currentProgram().getMinAddress().add(1), seqstr, java.lang.Integer.MAX_VALUE, 1)  # type: ignore [name-defined] # noqa: F821
+    eas = findBytes(addr, seqstr, java.lang.Integer.MAX_VALUE, 1)  # type: ignore [name-defined] # noqa: F821
+
    yield from eas


@@ -48,15 +49,10 @@ def get_bytes(addr: ghidra.program.model.address.Address, length: int) -> bytes:
        addr: Address to begin pull from
        length: length of bytes to pull
    """
-
-    bytez = b""
    try:
-        signed_ints = getBytes(addr, length)  # type: ignore [name-defined] # noqa: F821
-        for b in signed_ints:
-            bytez = bytez + fix_byte(b)
-        return bytez
+        return ints_to_bytes(getBytes(addr, length))  # type: ignore [name-defined] # noqa: F821
    except RuntimeError:
-        return bytez
+        return b""


 def get_block_bytes(block: ghidra.program.model.mem.MemoryBlock) -> bytes:
@@ -65,15 +61,7 @@ def get_block_bytes(block: ghidra.program.model.mem.MemoryBlock) -> bytes:
    args:
        block: MemoryBlock to pull from
    """
-
-    bytez = b""
-    try:
-        signed_ints = getBytes(block.getStart(), block.getEnd().getOffset() - block.getStart().getOffset())  # type: ignore [name-defined] # noqa: F821
-        for b in signed_ints:
-            bytez = bytez + fix_byte(b)
-        return bytez
-    except RuntimeError:
-        return bytez
+    return get_bytes(block.getStart(), block.getSize())


 def get_function_symbols() -> Iterator[FunctionHandle]:
--- a/capa/ghidra/helpers.py
+++ b/capa/ghidra/helpers.py
@@ -32,8 +32,9 @@ class GHIDRAIO:

    def __init__(self):
        super().__init__()
+
        self.offset = 0
-        self.bytez = self.get_file_bytes()
+        self.bytes_ = self.get_bytes()

    def seek(self, offset, whence=0):
        assert whence == 0
@@ -42,31 +43,23 @@ class GHIDRAIO:
    def read(self, size):
        logger.debug("reading 0x%x bytes at 0x%x (ea: 0x%x)", size, self.offset, currentProgram().getImageBase().add(self.offset).getOffset())  # type: ignore [name-defined] # noqa: F821

-        b_len = len(self.bytez)
-        if size > b_len - self.offset:
+        if size > len(self.bytes_) - self.offset:
            logger.debug("cannot read 0x%x bytes at 0x%x (ea: BADADDR)", size, self.offset)
            return b""
        else:
-            read_bytes = b""
-            read = [
-                capa.features.extractors.ghidra.helpers.fix_byte(b)
-                for b in self.bytez[self.offset : self.offset + size]
-            ]
-            for b in read:
-                read_bytes = read_bytes + b
-            return read_bytes
+            return self.bytes_[self.offset : self.offset + size]

    def close(self):
        return

-    def get_file_bytes(self):
-        fbytes = currentProgram().getMemory().getAllFileBytes()[0]  # type: ignore [name-defined] # noqa: F821
-        bytez = b""
-        for i in range(fbytes.getSize()):
-            # getOriginalByte() allows for raw file parsing on the Ghidra side
-            # other functions will fail as Ghidra will think that it's reading uninitialized memory
-            bytez = bytez + capa.features.extractors.ghidra.helpers.fix_byte(fbytes.getOriginalByte(i))
-        return bytez
+    def get_bytes(self):
+        file_bytes = currentProgram().getMemory().getAllFileBytes()[0]  # type: ignore [name-defined] # noqa: F821
+
+        # getOriginalByte() allows for raw file parsing on the Ghidra side
+        # other functions will fail as Ghidra will think that it's reading uninitialized memory
+        bytes_ = [file_bytes.getOriginalByte(i) for i in range(file_bytes.getSize())]
+
+        return capa.features.extractors.ghidra.helpers.ints_to_bytes(bytes_)


 def is_supported_ghidra_version():