fix ints_to_bytes performance (#1761)

* fix ints_to_bytes performance
This commit is contained in:
Mike Hunhoff
2023-08-24 17:01:41 -06:00
committed by GitHub
parent bd2f7bc1f4
commit 448b122ef0
4 changed files with 28 additions and 46 deletions

View File

@@ -28,6 +28,7 @@
- ELF: fix parsing of symtab #1704 @williballenthin
- result document: don't use deprecated pydantic functions #1718 @williballenthin
- pytest: don't mark IDA tests as pytest tests #1719 @williballenthin
- ghidra: fix ints_to_bytes performance #1761 @mike-hunhoff
### capa explorer IDA Pro plugin
- fix unhandled exception when resolving rule path #1693 @mike-hunhoff

View File

@@ -22,7 +22,7 @@ from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, Absolu
MAX_OFFSET_PE_AFTER_MZ = 0x200
def check_segment_for_pe() -> Iterator[Tuple[int, int]]:
def find_embedded_pe() -> Iterator[Tuple[int, int]]:
"""check segment for embedded PE
adapted for Ghidra from:
@@ -39,10 +39,11 @@ def check_segment_for_pe() -> Iterator[Tuple[int, int]]:
]
todo = []
start_addr = currentProgram().getMinAddress().add(1) # type: ignore [name-defined] # noqa: F821
for mzx, pex, i in mz_xor:
# find all segment offsets containing XOR'd "MZ" bytes
off: ghidra.program.model.address.GenericAddress
for off in capa.features.extractors.ghidra.helpers.find_byte_sequence(mzx):
for off in capa.features.extractors.ghidra.helpers.find_byte_sequence(start_addr, mzx):
todo.append((off, mzx, pex, i))
seg_max = currentProgram().getMaxAddress() # type: ignore [name-defined] # noqa: F821
@@ -73,8 +74,7 @@ def check_segment_for_pe() -> Iterator[Tuple[int, int]]:
def extract_file_embedded_pe() -> Iterator[Tuple[Feature, Address]]:
"""extract embedded PE features"""
for ea, _ in check_segment_for_pe():
for ea, _ in find_embedded_pe():
yield Characteristic("embedded pe"), FileOffsetAddress(ea)

View File

@@ -20,24 +20,25 @@ from capa.features.address import AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
def fix_byte(b: int) -> bytes:
"""Transform signed ints from Java into bytes for Python
def ints_to_bytes(bytez: List[int]) -> bytes:
"""convert Java signed ints to Python bytes
args:
b: signed int returned from Java processing
bytez: list of Java signed ints
"""
return (b & 0xFF).to_bytes(1, "little")
return bytes([b & 0xFF for b in bytez])
def find_byte_sequence(seq: bytes) -> Iterator[int]:
def find_byte_sequence(addr: ghidra.program.model.address.Address, seq: bytes) -> Iterator[int]:
"""yield every address (ea) where the given byte sequence is found
args:
addr: start address
seq: bytes to search e.g. b"\x01\x03"
"""
seqstr = "".join([f"\\x{b:02x}" for b in seq])
# .add(1) to avoid false positives on regular PE files
eas = findBytes(currentProgram().getMinAddress().add(1), seqstr, java.lang.Integer.MAX_VALUE, 1) # type: ignore [name-defined] # noqa: F821
eas = findBytes(addr, seqstr, java.lang.Integer.MAX_VALUE, 1) # type: ignore [name-defined] # noqa: F821
yield from eas
@@ -48,15 +49,10 @@ def get_bytes(addr: ghidra.program.model.address.Address, length: int) -> bytes:
addr: address to begin pulling bytes from
length: number of bytes to pull
"""
bytez = b""
try:
signed_ints = getBytes(addr, length) # type: ignore [name-defined] # noqa: F821
for b in signed_ints:
bytez = bytez + fix_byte(b)
return bytez
return ints_to_bytes(getBytes(addr, length)) # type: ignore [name-defined] # noqa: F821
except RuntimeError:
return bytez
return b""
def get_block_bytes(block: ghidra.program.model.mem.MemoryBlock) -> bytes:
@@ -65,15 +61,7 @@ def get_block_bytes(block: ghidra.program.model.mem.MemoryBlock) -> bytes:
args:
block: MemoryBlock to pull from
"""
bytez = b""
try:
signed_ints = getBytes(block.getStart(), block.getEnd().getOffset() - block.getStart().getOffset()) # type: ignore [name-defined] # noqa: F821
for b in signed_ints:
bytez = bytez + fix_byte(b)
return bytez
except RuntimeError:
return bytez
return get_bytes(block.getStart(), block.getSize())
def get_function_symbols() -> Iterator[FunctionHandle]:

View File

@@ -32,8 +32,9 @@ class GHIDRAIO:
def __init__(self):
super().__init__()
self.offset = 0
self.bytez = self.get_file_bytes()
self.bytes_ = self.get_bytes()
def seek(self, offset, whence=0):
assert whence == 0
@@ -42,31 +43,23 @@ class GHIDRAIO:
def read(self, size):
logger.debug("reading 0x%x bytes at 0x%x (ea: 0x%x)", size, self.offset, currentProgram().getImageBase().add(self.offset).getOffset()) # type: ignore [name-defined] # noqa: F821
b_len = len(self.bytez)
if size > b_len - self.offset:
if size > len(self.bytes_) - self.offset:
logger.debug("cannot read 0x%x bytes at 0x%x (ea: BADADDR)", size, self.offset)
return b""
else:
read_bytes = b""
read = [
capa.features.extractors.ghidra.helpers.fix_byte(b)
for b in self.bytez[self.offset : self.offset + size]
]
for b in read:
read_bytes = read_bytes + b
return read_bytes
return self.bytes_[self.offset : self.offset + size]
def close(self):
return
def get_file_bytes(self):
fbytes = currentProgram().getMemory().getAllFileBytes()[0] # type: ignore [name-defined] # noqa: F821
bytez = b""
for i in range(fbytes.getSize()):
# getOriginalByte() allows for raw file parsing on the Ghidra side
# other functions will fail as Ghidra will think that it's reading uninitialized memory
bytez = bytez + capa.features.extractors.ghidra.helpers.fix_byte(fbytes.getOriginalByte(i))
return bytez
def get_bytes(self):
file_bytes = currentProgram().getMemory().getAllFileBytes()[0] # type: ignore [name-defined] # noqa: F821
# getOriginalByte() allows for raw file parsing on the Ghidra side
# other functions will fail as Ghidra will think that it's reading uninitialized memory
bytes_ = [file_bytes.getOriginalByte(i) for i in range(file_bytes.getSize())]
return capa.features.extractors.ghidra.helpers.ints_to_bytes(bytes_)
def is_supported_ghidra_version():