From 4a698ffdff6cfc2e25a73db6afab9bd941e54a22 Mon Sep 17 00:00:00 2001 From: Xusheng Date: Fri, 24 Feb 2023 12:15:27 +0800 Subject: [PATCH] Add a Binary Ninja backend for capa --- .github/pyinstaller/pyinstaller.spec | 1 + CHANGELOG.md | 2 + capa/features/extractors/binja/__init__.py | 0 capa/features/extractors/binja/basicblock.py | 148 ++++ capa/features/extractors/binja/extractor.py | 68 ++ capa/features/extractors/binja/file.py | 189 +++++ .../extractors/binja/find_binja_api.py | 32 + capa/features/extractors/binja/function.py | 94 +++ capa/features/extractors/binja/global_.py | 55 ++ capa/features/extractors/binja/helpers.py | 50 ++ capa/features/extractors/binja/insn.py | 652 ++++++++++++++++++ capa/main.py | 30 +- tests/fixtures.py | 23 + tests/test_binja_features.py | 38 + 14 files changed, 1381 insertions(+), 1 deletion(-) create mode 100644 capa/features/extractors/binja/__init__.py create mode 100644 capa/features/extractors/binja/basicblock.py create mode 100644 capa/features/extractors/binja/extractor.py create mode 100644 capa/features/extractors/binja/file.py create mode 100644 capa/features/extractors/binja/find_binja_api.py create mode 100644 capa/features/extractors/binja/function.py create mode 100644 capa/features/extractors/binja/global_.py create mode 100644 capa/features/extractors/binja/helpers.py create mode 100644 capa/features/extractors/binja/insn.py create mode 100644 tests/test_binja_features.py diff --git a/.github/pyinstaller/pyinstaller.spec b/.github/pyinstaller/pyinstaller.spec index 0e614055..7d90e966 100644 --- a/.github/pyinstaller/pyinstaller.spec +++ b/.github/pyinstaller/pyinstaller.spec @@ -61,6 +61,7 @@ a = Analysis( "qt5", "pyqtwebengine", "pyasn1", + "binaryninja", ], ) diff --git a/CHANGELOG.md b/CHANGELOG.md index 443de56b..e07cf82f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ### New Features +- extractor: add Binary Ninja feature extractor @xusheng6 + ### Breaking Changes ### New Rules (12) diff --git 
a/capa/features/extractors/binja/__init__.py b/capa/features/extractors/binja/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/capa/features/extractors/binja/basicblock.py b/capa/features/extractors/binja/basicblock.py new file mode 100644 index 00000000..297094d4 --- /dev/null +++ b/capa/features/extractors/binja/basicblock.py @@ -0,0 +1,148 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. + +import sys +import string +import struct +from typing import Tuple, Iterator + +from binaryninja import Function, Variable +from binaryninja import BasicBlock as BinjaBasicBlock +from binaryninja import ( + BinaryView, + VariableSourceType, + MediumLevelILSetVar, + MediumLevelILOperation, + MediumLevelILInstruction, +) + +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.basicblock import BasicBlock +from capa.features.extractors.helpers import MIN_STACKSTRING_LEN +from capa.features.extractors.binja.helpers import DisassemblyInstruction +from capa.features.extractors.base_extractor import BBHandle, FunctionHandle + + +def get_printable_len(il: MediumLevelILSetVar) -> int: + """Return string length if all operand bytes are ascii or utf16-le printable""" + width = il.dest.type.width + value = il.src.value.value + + if width == 1: + chars = struct.pack(" bool: + """verify instruction moves immediate onto stack""" + if il.operation != 
MediumLevelILOperation.MLIL_SET_VAR: + return False + + if il.src.operation != MediumLevelILOperation.MLIL_CONST: + return False + + if not il.dest.source_type == VariableSourceType.StackVariableSourceType: + return False + + return True + + +def bb_contains_stackstring(f: Function, bb: BinjaBasicBlock) -> bool: + """check basic block for stackstring indicators + + true if basic block contains enough moves of constant bytes to the stack + """ + count = 0 + mlil_bbs = [mlil_bb for mlil_bb in bb.function.mlil_basic_blocks if mlil_bb.source_block.start == bb.start] + for mlil_bb in mlil_bbs: + for il in mlil_bb: + if is_mov_imm_to_stack(il): + count += get_printable_len(il) + if count > MIN_STACKSTRING_LEN: + return True + return False + + +def extract_bb_stackstring(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + """extract stackstring indicators from basic block""" + bb: BinjaBasicBlock = bbh.inner + if bb_contains_stackstring(fh.inner, bbh.inner): + yield Characteristic("stack string"), bbh.address + + +def extract_bb_tight_loop(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + """extract tight loop indicators from a basic block""" + bb: BinjaBasicBlock = bbh.inner + for edge in bb.outgoing_edges: + if edge.target.start == bb.start: + yield Characteristic("tight loop"), bbh.address + + +def extract_features(fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + """extract basic block features""" + for bb_handler in BASIC_BLOCK_HANDLERS: + for feature, addr in bb_handler(fh, bbh): + yield feature, addr + yield BasicBlock(), bbh.address + + +BASIC_BLOCK_HANDLERS = ( + extract_bb_tight_loop, + extract_bb_stackstring, +) + + +def main(): + if len(sys.argv) < 2: + return + + import pprint + + from binaryninja import BinaryViewType + + bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1]) + if bv is None: + return + + features = [] + for f in bv.functions: + fh = 
FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f) + for bb in f.basic_blocks: + bbh = BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb) + features.extend(list(extract_features(fh, bbh))) + + import pprint + + pprint.pprint(features) + + +if __name__ == "__main__": + main() diff --git a/capa/features/extractors/binja/extractor.py b/capa/features/extractors/binja/extractor.py new file mode 100644 index 00000000..17e7719a --- /dev/null +++ b/capa/features/extractors/binja/extractor.py @@ -0,0 +1,68 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+from typing import List, Tuple, Iterator + +import binaryninja as binja + +import capa.features.extractors.elf +import capa.features.extractors.binja.file +import capa.features.extractors.binja.insn +import capa.features.extractors.binja.global_ +import capa.features.extractors.binja.function +import capa.features.extractors.binja.basicblock +from capa.features.common import Feature +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor + + +class BinjaFeatureExtractor(FeatureExtractor): + def __init__(self, bv: binja.BinaryView): + super().__init__() + self.bv = bv + self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv)) + self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv)) + self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv)) + + def get_base_address(self): + return AbsoluteVirtualAddress(self.bv.start) + + def extract_global_features(self): + yield from self.global_features + + def extract_file_features(self): + yield from capa.features.extractors.binja.file.extract_features(self.bv) + + def get_functions(self) -> Iterator[FunctionHandle]: + for f in self.bv.functions: + yield FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f) + + def extract_function_features(self, fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.binja.function.extract_features(fh) + + def get_basic_blocks(self, fh: FunctionHandle) -> Iterator[BBHandle]: + f: binja.Function = fh.inner + for bb in f.basic_blocks: + yield BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb) + + def extract_basic_block_features(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[Tuple[Feature, Address]]: + yield from 
capa.features.extractors.binja.basicblock.extract_features(fh, bbh) + + def get_instructions(self, fh: FunctionHandle, bbh: BBHandle) -> Iterator[InsnHandle]: + import capa.features.extractors.binja.helpers as binja_helpers + + bb: binja.BasicBlock = bbh.inner + addr = bb.start + + for text, length in bb: + insn = binja_helpers.DisassemblyInstruction(addr, length, text) + yield InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn) + addr += length + + def extract_insn_features(self, fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle): + yield from capa.features.extractors.binja.insn.extract_features(fh, bbh, ih) diff --git a/capa/features/extractors/binja/file.py b/capa/features/extractors/binja/file.py new file mode 100644 index 00000000..1e54ee5d --- /dev/null +++ b/capa/features/extractors/binja/file.py @@ -0,0 +1,189 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import sys +import struct +from typing import Tuple, Iterator + +from binaryninja import Symbol, Segment, BinaryView, SymbolType, SymbolBinding + +import capa.features.extractors.common +import capa.features.extractors.helpers +import capa.features.extractors.strings +from capa.features.file import Export, Import, Section, FunctionName +from capa.features.common import FORMAT_PE, FORMAT_ELF, Format, String, Feature, Characteristic +from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress, AbsoluteVirtualAddress + + +def check_segment_for_pe(bv: BinaryView, seg: Segment) -> Iterator[Tuple[int, int]]: + """check segment for embedded PE + + adapted for binja from: + https://github.com/vivisect/vivisect/blob/7be4037b1cecc4551b397f840405a1fc606f9b53/PE/carve.py#L19 + """ + mz_xor = [ + ( + capa.features.extractors.helpers.xor_static(b"MZ", i), + capa.features.extractors.helpers.xor_static(b"PE", i), + i, + ) + for i in range(256) + ] + + todo = [] + # If this is the first segment of the binary, skip the first bytes. Otherwise, there will always be a matched + # PE at the start of the binaryview. 
+ start = seg.start + if bv.view_type == "PE" and start == bv.start: + start += 1 + + for mzx, pex, i in mz_xor: + for off, _ in bv.find_all_data(start, seg.end, mzx): + todo.append((off, mzx, pex, i)) + + while len(todo): + off, mzx, pex, i = todo.pop() + + # The MZ header has one field we will check e_lfanew is at 0x3c + e_lfanew = off + 0x3C + + if seg.end < (e_lfanew + 4): + continue + + newoff = struct.unpack(" Iterator[Tuple[Feature, Address]]: + """extract embedded PE features""" + for seg in bv.segments: + for ea, _ in check_segment_for_pe(bv, seg): + yield Characteristic("embedded pe"), FileOffsetAddress(ea) + + +def extract_file_export_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """extract function exports""" + from capa.features.extractors.binja.helpers import unmangle_c_name + + for sym in bv.get_symbols_of_type(SymbolType.FunctionSymbol): + if sym.binding in [SymbolBinding.GlobalBinding, SymbolBinding.WeakBinding]: + name = sym.short_name + yield Export(name), AbsoluteVirtualAddress(sym.address) + unmangled_name = unmangle_c_name(name) + if name != unmangled_name: + yield Export(unmangled_name), AbsoluteVirtualAddress(sym.address) + + +def extract_file_import_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """extract function imports + + 1. imports by ordinal: + - modulename.#ordinal + + 2. 
imports by name, results in two features to support importname-only + matching: + - modulename.importname + - importname + """ + for sym in bv.get_symbols_of_type(SymbolType.ImportAddressSymbol): + lib_name = str(sym.namespace) + addr = AbsoluteVirtualAddress(sym.address) + for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym.short_name): + yield Import(name), addr + + ordinal = sym.ordinal + if ordinal != 0 and (lib_name != ""): + ordinal_name = "#%d" % (ordinal) + for name in capa.features.extractors.helpers.generate_symbols(lib_name, ordinal_name): + yield Import(name), addr + + +def extract_file_section_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """extract section names""" + for name, section in bv.sections.items(): + yield Section(name), AbsoluteVirtualAddress(section.start) + + +def extract_file_strings(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """extract ASCII and UTF-16 LE strings""" + for s in bv.strings: + yield String(s.value), FileOffsetAddress(s.start) + + +def extract_file_function_names(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """ + extract the names of statically-linked library functions. + """ + for sym_name in bv.symbols: + for sym in bv.symbols[sym_name]: + if sym.type == SymbolType.LibraryFunctionSymbol: + name = sym.short_name + yield FunctionName(name), AbsoluteVirtualAddress(sym.address) + if name.startswith("_"): + # some linkers may prefix linked routines with a `_` to avoid name collisions. + # extract features for both the mangled and un-mangled representations. + # e.g.
`_fwrite` -> `fwrite` + # see: https://stackoverflow.com/a/2628384/87207 + yield FunctionName(name[1:]), AbsoluteVirtualAddress(sym.address) + + +def extract_file_format(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + view_type = bv.view_type + if view_type in ["PE", "COFF"]: + yield Format(FORMAT_PE), NO_ADDRESS + elif view_type == "ELF": + yield Format(FORMAT_ELF), NO_ADDRESS + elif view_type == "Raw": + # no file type to return when processing a binary file, but we want to continue processing + return + else: + raise NotImplementedError("unexpected file format: %s" % view_type) + + +def extract_features(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + """extract file features""" + for file_handler in FILE_HANDLERS: + for feature, addr in file_handler(bv): + yield feature, addr + + +FILE_HANDLERS = ( + extract_file_export_names, + extract_file_import_names, + extract_file_strings, + extract_file_section_names, + extract_file_embedded_pe, + extract_file_function_names, + extract_file_format, +) + + +def main(): + """ """ + if len(sys.argv) < 2: + return + + from binaryninja import BinaryViewType + + bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1]) + if bv is None: + return + + import pprint + + pprint.pprint(list(extract_features(bv))) + + +if __name__ == "__main__": + main() diff --git a/capa/features/extractors/binja/find_binja_api.py b/capa/features/extractors/binja/find_binja_api.py new file mode 100644 index 00000000..bba23e85 --- /dev/null +++ b/capa/features/extractors/binja/find_binja_api.py @@ -0,0 +1,32 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import subprocess + +# When the script gets executed as a standalone executable (via PyInstaller), `import binaryninja` does not work because +# we have excluded the binaryninja module in `pyinstaller.spec`. The trick here is to call the system Python and try +# to find out the path of the binaryninja module that has been installed. +# Note, including the binaryninja module in the `pyinstaller.spec` would not work, since the binaryninja module tries to +# find the binaryninja core e.g., `libbinaryninjacore.dylib`, using a relative path. And this does not work when the +# binaryninja module is extracted by the PyInstaller. +code = r""" +from pathlib import Path +import importlib.util +spec = importlib.util.find_spec('binaryninja') +if spec is not None: + if len(spec.submodule_search_locations) > 0: + path = Path(spec.submodule_search_locations[0]) + print(str(path.parent)) +""" + + +def find_binja_path() -> str: + return subprocess.check_output(["python", "-c", "%s" % code]).decode("ascii").strip() + + +if __name__ == "__main__": + print(find_binja_path()) diff --git a/capa/features/extractors/binja/function.py b/capa/features/extractors/binja/function.py new file mode 100644 index 00000000..d9ff5a71 --- /dev/null +++ b/capa/features/extractors/binja/function.py @@ -0,0 +1,94 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import sys +from typing import Tuple, Iterator + +from binaryninja import Function, BinaryView, LowLevelILOperation + +from capa.features.common import Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors import loops +from capa.features.extractors.base_extractor import FunctionHandle + + +def extract_function_calls_to(fh: FunctionHandle): + """extract callers to a function""" + func: Function = fh.inner + bv: BinaryView = func.view + + for caller in func.caller_sites: + # Everything that is a code reference to the current function is considered a caller, which actually includes + # many other references that are NOT a caller. 
For example, an instruction `push function_start` will also be + # considered a caller to the function + if caller.llil.operation in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_JUMP, + LowLevelILOperation.LLIL_TAILCALL, + ]: + yield Characteristic("calls to"), AbsoluteVirtualAddress(caller.address) + + +def extract_function_loop(fh: FunctionHandle): + """extract loop indicators from a function""" + func: Function = fh.inner + + edges = [] + + # construct control flow graph + for bb in func.basic_blocks: + for edge in bb.outgoing_edges: + edges.append((bb.start, edge.target.start)) + + if loops.has_loop(edges): + yield Characteristic("loop"), fh.address + + +def extract_recursive_call(fh: FunctionHandle): + """extract recursive function call""" + func: Function = fh.inner + bv: BinaryView = func.view + if bv is None: + return + + for ref in bv.get_code_refs(func.start): + if ref.function == func: + yield Characteristic("recursive call"), fh.address + + +def extract_features(fh: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + for func_handler in FUNCTION_HANDLERS: + for feature, addr in func_handler(fh): + yield feature, addr + + +FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call) + + +def main(): + """ """ + if len(sys.argv) < 2: + return + + from binaryninja import BinaryViewType + + bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1]) + if bv is None: + return + + features = [] + for f in bv.functions: + features.extend(list(extract_features(FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f)))) + + import pprint + + pprint.pprint(features) + + +if __name__ == "__main__": + main() diff --git a/capa/features/extractors/binja/global_.py b/capa/features/extractors/binja/global_.py new file mode 100644 index 00000000..434ed43d --- /dev/null +++ b/capa/features/extractors/binja/global_.py @@ -0,0 +1,55 @@ +import logging +import 
contextlib +from typing import Tuple, Iterator + +from binaryninja import BinaryView + +import capa.features.extractors.elf +from capa.features.common import OS, OS_MACOS, ARCH_I386, ARCH_AMD64, OS_WINDOWS, Arch, Feature +from capa.features.address import NO_ADDRESS, Address + +logger = logging.getLogger(__name__) + + +def extract_os(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + name = bv.platform.name + if "-" in name: + name = name.split("-")[0] + + if name == "windows": + yield OS(OS_WINDOWS), NO_ADDRESS + + elif name == "macos": + yield OS(OS_MACOS), NO_ADDRESS + + elif name in ["linux", "freebsd", "decree"]: + yield OS(name), NO_ADDRESS + + else: + # we likely end up here: + # 1. handling shellcode, or + # 2. handling a new file format (e.g. macho) + # + # for (1) we can't do much - it's shellcode and all bets are off. + # we could maybe accept a further CLI argument to specify the OS, + # but i think this would be rarely used. + # rules that rely on OS conditions will fail to match on shellcode. + # + # for (2), this logic will need to be updated as the format is implemented. + logger.debug("unsupported file format: %s, will not guess OS", name) + return + + +def extract_arch(bv: BinaryView) -> Iterator[Tuple[Feature, Address]]: + arch = bv.arch.name + if arch == "x86_64": + yield Arch(ARCH_AMD64), NO_ADDRESS + elif arch == "x86": + yield Arch(ARCH_I386), NO_ADDRESS + else: + # we likely end up here: + # 1. handling a new architecture (e.g. aarch64) + # + # for (1), this logic will need to be updated as the format is implemented. + logger.debug("unsupported architecture: %s", arch) + return diff --git a/capa/features/extractors/binja/helpers.py b/capa/features/extractors/binja/helpers.py new file mode 100644 index 00000000..499de97b --- /dev/null +++ b/capa/features/extractors/binja/helpers.py @@ -0,0 +1,50 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import re +from typing import List, Callable +from dataclasses import dataclass + +from binaryninja import LowLevelILInstruction +from binaryninja.architecture import InstructionTextToken + + +@dataclass +class DisassemblyInstruction: + address: int + length: int + text: List[InstructionTextToken] + + +LLIL_VISITOR = Callable[[LowLevelILInstruction, LowLevelILInstruction, int], bool] + + +def visit_llil_exprs(il: LowLevelILInstruction, func: LLIL_VISITOR): + # BN does not really support operand index at the disassembly level, so use the LLIL operand index as a substitute. + # Note, this is NOT always guaranteed to be the same as disassembly operand. 
+ for i, op in enumerate(il.operands): + if isinstance(op, LowLevelILInstruction) and func(op, il, i): + visit_llil_exprs(op, func) + + +def unmangle_c_name(name: str) -> str: + # https://learn.microsoft.com/en-us/cpp/build/reference/decorated-names?view=msvc-170#FormatC + # Possible variations for BaseThreadInitThunk: + # @BaseThreadInitThunk@12 + # _BaseThreadInitThunk + # _BaseThreadInitThunk@12 + # It is also possible for a function to have a `Stub` appended to its name: + # _lstrlenWStub@4 + + # A small optimization to avoid running the regex too many times + # TODO: this still increases the unit test execution time from 170s to 200s, should be able to accelerate it + if name[:1] in ["@", "_"]: + match = re.match(r"^[@_](.*?)(Stub)?(@\d+)?$", name) + if match: + return match.group(1) + + return name diff --git a/capa/features/extractors/binja/insn.py b/capa/features/extractors/binja/insn.py new file mode 100644 index 00000000..c676538b --- /dev/null +++ b/capa/features/extractors/binja/insn.py @@ -0,0 +1,652 @@ +# Copyright (C) 2020 Mandiant, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License.
+import sys +from typing import Any, Dict, List, Tuple, Iterator, Optional + +from binaryninja import Function +from binaryninja import BasicBlock as BinjaBasicBlock +from binaryninja import ( + BinaryView, + ILRegister, + SymbolType, + BinaryReader, + RegisterValueType, + LowLevelILOperation, + LowLevelILInstruction, + InstructionTextTokenType, +) + +import capa.features.extractors.helpers +import capa.features.extractors.binja.helpers +from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset +from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic +from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.extractors.binja.helpers import DisassemblyInstruction +from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle + +# security cookie checks may perform non-zeroing XORs, these are expected within a certain +# byte range within the first and returning basic blocks, this helps to reduce FP features +SECURITY_COOKIE_BYTES_DELTA = 0x40 + + +# check if a function is a stub function to another function/symbol. The criteria is: +# 1. The function must only have one basic block +# 2. The function must only make one call/jump to another address +# If the function being checked is a stub function, returns the target address. Otherwise, return None. 
+def is_stub_function(bv: BinaryView, addr: int) -> Optional[int]: + funcs = bv.get_functions_at(addr) + for func in funcs: + if len(func.basic_blocks) != 1: + continue + + call_count = 0 + call_target = None + for il in func.llil.instructions: + if il.operation in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_JUMP, + LowLevelILOperation.LLIL_TAILCALL, + ]: + call_count += 1 + if il.dest.value.type in [ + RegisterValueType.ImportedAddressValue, + RegisterValueType.ConstantValue, + RegisterValueType.ConstantPointerValue, + ]: + call_target = il.dest.value.value + + if call_count == 1 and call_target is not None: + return call_target + + return None + + +def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + """ + parse instruction API features + + example: + call dword [0x00473038] + """ + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + for llil in func.get_llils_at(ih.address): + if llil.operation in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_JUMP, + LowLevelILOperation.LLIL_TAILCALL, + ]: + if llil.dest.value.type not in [ + RegisterValueType.ImportedAddressValue, + RegisterValueType.ConstantValue, + RegisterValueType.ConstantPointerValue, + ]: + continue + address = llil.dest.value.value + candidate_addrs = [address] + stub_addr = is_stub_function(bv, address) + if stub_addr is not None: + candidate_addrs.append(stub_addr) + + for address in candidate_addrs: + sym = func.view.get_symbol_at(address) + if sym is None or sym.type not in [SymbolType.ImportAddressSymbol, SymbolType.ImportedFunctionSymbol]: + continue + + sym_name = sym.short_name + + lib_name = "" + import_lib = bv.lookup_imported_object_library(sym.address) + if import_lib is not None: + lib_name = import_lib[0].name + if lib_name.endswith(".dll"): + lib_name 
= lib_name[:-4] + elif lib_name.endswith(".so"): + lib_name = lib_name[:-3] + + for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym_name): + yield API(name), ih.address + + if sym_name.startswith("_"): + for name in capa.features.extractors.helpers.generate_symbols(lib_name, sym_name[1:]): + yield API(name), ih.address + + +def extract_insn_number_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse instruction number features + example: + push 3136B0h ; dwControlCode + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + results: List[Tuple[Any[Number, OperandNumber], Address]] = [] + address_size = func.view.arch.address_size * 8 + + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + if il.operation == LowLevelILOperation.LLIL_LOAD: + return False + + if il.operation not in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: + return True + + for op in parent.operands: + if isinstance(op, ILRegister) and op.name in ["esp", "ebp", "rsp", "rbp", "sp"]: + return False + elif isinstance(op, LowLevelILInstruction) and op.operation == LowLevelILOperation.LLIL_REG: + if op.src.name in ["esp", "ebp", "rsp", "rbp", "sp"]: + return False + + raw_value = il.value.value + if parent.operation == LowLevelILOperation.LLIL_SUB: + raw_value = -raw_value + + results.append((Number(raw_value), ih.address)) + results.append((OperandNumber(index, raw_value), ih.address)) + + return False + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for result in results: + yield result + + +def extract_insn_bytes_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + """ + parse referenced byte sequences + example: + push offset 
iid_004118d4_IShellLinkA ; riid + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + candidate_addrs = set() + + llil = func.get_llil_at(ih.address) + if llil is None or llil.operation in [LowLevelILOperation.LLIL_CALL, LowLevelILOperation.LLIL_CALL_STACK_ADJUST]: + return + + for ref in bv.get_code_refs_from(ih.address): + if ref == ih.address: + continue + + if len(bv.get_functions_containing(ref)) > 0: + continue + + candidate_addrs.add(ref) + + # collect candidate address by enumerating all integers, https://github.com/Vector35/binaryninja-api/issues/3966 + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + if il.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: + value = il.value.value + if value > 0: + candidate_addrs.add(value) + return False + + return True + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for addr in candidate_addrs: + extracted_bytes = bv.read(addr, MAX_BYTES_FEATURE_SIZE) + if extracted_bytes and not capa.features.extractors.helpers.all_zeros(extracted_bytes): + if bv.get_string_at(addr) is None: + # don't extract byte features for obvious strings + yield Bytes(extracted_bytes), ih.address + + +def extract_insn_string_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse instruction string features + + example: + push offset aAcr ; "ACR > " + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + candidate_addrs = set() + + # collect candidate address from code refs directly + for ref in bv.get_code_refs_from(ih.address): + if ref == ih.address: + continue + + if len(bv.get_functions_containing(ref)) > 0: + 
continue + + candidate_addrs.add(ref) + + # collect candidate address by enumerating all integers, https://github.com/Vector35/binaryninja-api/issues/3966 + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + if il.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: + value = il.value.value + if value > 0: + candidate_addrs.add(value) + return False + + return True + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + # Now we have all the candidate address, check them for string or pointer to string + br = BinaryReader(bv) + for addr in candidate_addrs: + found = bv.get_string_at(addr) + if found: + yield String(found.value), ih.address + + br.seek(addr) + pointer = None + if bv.arch.address_size == 4: + pointer = br.read32() + elif bv.arch.address_size == 8: + pointer = br.read64() + + if pointer is not None: + found = bv.get_string_at(pointer) + if found: + yield String(found.value), ih.address + + +def extract_insn_offset_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse instruction structure offset features + + example: + .text:0040112F cmp [esi+4], ebx + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + + results: List[Tuple[Any[Offset, OperandOffset], Address]] = [] + address_size = func.view.arch.address_size * 8 + + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + # The most common case, read/write dereference to something like `dword [eax+0x28]` + if il.operation in [LowLevelILOperation.LLIL_ADD, LowLevelILOperation.LLIL_SUB]: + left = il.left + right = il.right + # Exclude offsets based on stack/frame pointers + if left.operation == LowLevelILOperation.LLIL_REG and left.src.name in ["esp", "ebp", "rsp", "rbp", "sp"]: + return True +
+ if right.operation != LowLevelILOperation.LLIL_CONST: + return True + + raw_value = right.value.value + # If this is not a dereference, then this must be an add and the offset must be in the range \ + # (0, MAX_STRUCTURE_SIZE). For example, + # add eax, 0x10, + # lea ebx, [eax + 1] + if parent.operation not in [LowLevelILOperation.LLIL_LOAD, LowLevelILOperation.LLIL_STORE]: + if il.operation != LowLevelILOperation.LLIL_ADD or (not 0 < raw_value < MAX_STRUCTURE_SIZE): + return False + + if address_size > 0: + # BN also encodes the constant value as two's complement, we need to restore its original value + value = capa.features.extractors.helpers.twos_complement(raw_value, address_size) + else: + value = raw_value + + results.append((Offset(value), ih.address)) + results.append((OperandOffset(index, value), ih.address)) + return False + + # An edge case: for code like `push dword [esi]`, we need to generate a feature for offset 0x0 + elif il.operation in [LowLevelILOperation.LLIL_LOAD, LowLevelILOperation.LLIL_STORE]: + if il.operands[0].operation == LowLevelILOperation.LLIL_REG: + results.append((Offset(0), ih.address)) + results.append((OperandOffset(index, 0), ih.address)) + return False + + return True + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for result in results: + yield result + + +def is_nzxor_stack_cookie(f: Function, bb: BinjaBasicBlock, llil: LowLevelILInstruction) -> bool: + """check if nzxor exists within stack cookie delta""" + # TODO: we can do a much more accurate analysis using LLIL SSA + + reg_names = [] + if llil.left.operation == LowLevelILOperation.LLIL_REG: + reg_names.append(llil.left.src.name) + + if llil.right.operation == LowLevelILOperation.LLIL_REG: + reg_names.append(llil.right.src.name) + + # stack cookie reg should be stack/frame pointer + if not any(reg in ["ebp", "esp", "rbp", "rsp", "sp"] for reg in reg_names): + return False + + # expect security cookie init in first basic block within
first bytes (instructions) + if len(bb.incoming_edges) == 0 and llil.address < (bb.start + SECURITY_COOKIE_BYTES_DELTA): + return True + + # ... or within last bytes (instructions) before a return + if len(bb.outgoing_edges) == 0 and llil.address > (bb.end - SECURITY_COOKIE_BYTES_DELTA): + return True + + return False + + +def extract_insn_nzxor_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse instruction non-zeroing XOR instruction + ignore expected non-zeroing XORs, e.g. security cookies + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + + results = [] + + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + # If the two operands of the xor instruction are the same, the LLIL will be translated to other instructions, + # e.g., `xor eax, eax` becomes `eax = 0` (LLIL_SET_REG). So we do not need to check whether the two operands are the same. + if il.operation == LowLevelILOperation.LLIL_XOR: + # Exclude cases related to the stack cookie + if is_nzxor_stack_cookie(fh.inner, bbh.inner, il): + return False + results.append((Characteristic("nzxor"), ih.address)) + return False + else: + return True + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for result in results: + yield result + + +def extract_insn_mnemonic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """parse instruction mnemonic features""" + insn: DisassemblyInstruction = ih.inner + yield Mnemonic(insn.text[0].text), ih.address + + +def extract_insn_obfs_call_plus_5_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """ + parse call $+5 instruction from the given instruction.
+ """ + insn: DisassemblyInstruction = ih.inner + if insn.text[0].text == "call" and insn.text[2].text == "$+5" and insn.length == 5: + yield Characteristic("call $+5"), ih.address + + +def extract_insn_peb_access_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """parse instruction peb access + + fs:[0x30] on x86, gs:[0x60] on x64 + """ + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + + results = [] + + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILOperation, index: int) -> bool: + if il.operation != LowLevelILOperation.LLIL_LOAD: + return True + + src = il.src + if src.operation != LowLevelILOperation.LLIL_ADD: + return True + + left = src.left + right = src.right + + if left.operation != LowLevelILOperation.LLIL_REG: + return True + + reg = left.src.name + + if right.operation != LowLevelILOperation.LLIL_CONST: + return True + + value = right.value.value + if not (reg, value) in (("fsbase", 0x30), ("gsbase", 0x60)): + return True + + results.append((Characteristic("peb access"), ih.address)) + return False + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for result in results: + yield result + + +def extract_insn_segment_access_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """parse instruction fs or gs access""" + from capa.features.extractors.binja.helpers import visit_llil_exprs + + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + + results = [] + + def llil_checker(il: LowLevelILInstruction, parent: LowLevelILInstruction, index: int) -> bool: + if il.operation == LowLevelILOperation.LLIL_REG: + reg = il.src.name + if reg == "fsbase": + results.append((Characteristic("fs access"), ih.address)) + return False + elif reg == "gsbase": + 
results.append((Characteristic("gs access"), ih.address)) + return False + return False + + return True + + for llil in func.get_llils_at(ih.address): + visit_llil_exprs(llil, llil_checker) + + for result in results: + yield result + + +def extract_insn_cross_section_cflow( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """inspect the instruction for a CALL or JMP that crosses section boundaries""" + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + if bv is None: + return + + seg1 = bv.get_segment_at(ih.address) + sections1 = bv.get_sections_at(ih.address) + for ref in bv.get_code_refs_from(ih.address): + if len(bv.get_functions_at(ref)) == 0: + continue + + seg2 = bv.get_segment_at(ref) + sections2 = bv.get_sections_at(ref) + if seg1 != seg2 or sections1 != sections2: + yield Characteristic("cross section flow"), ih.address + + +def extract_function_calls_from(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + """extract functions calls from features + + most relevant at the function scope, however, its most efficient to extract at the instruction scope + """ + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + bv: BinaryView = func.view + + if bv is None: + return + + for il in func.get_llils_at(ih.address): + if il.operation not in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_TAILCALL, + ]: + continue + + dest = il.dest + if dest.operation == LowLevelILOperation.LLIL_CONST_PTR: + value = dest.value.value + yield Characteristic("calls from"), AbsoluteVirtualAddress(value) + elif dest.operation == LowLevelILOperation.LLIL_CONST: + yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value) + elif dest.operation == LowLevelILOperation.LLIL_LOAD: + indirect_src = dest.src + if indirect_src.operation == LowLevelILOperation.LLIL_CONST_PTR: 
+ value = indirect_src.value.value + yield Characteristic("calls from"), AbsoluteVirtualAddress(value) + elif indirect_src.operation == LowLevelILOperation.LLIL_CONST: + yield Characteristic("calls from"), AbsoluteVirtualAddress(indirect_src.value) + elif dest.operation == LowLevelILOperation.LLIL_REG: + if dest.value.type in [ + RegisterValueType.ImportedAddressValue, + RegisterValueType.ConstantValue, + RegisterValueType.ConstantPointerValue, + ]: + yield Characteristic("calls from"), AbsoluteVirtualAddress(dest.value.value) + + +def extract_function_indirect_call_characteristic_features( + fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle +) -> Iterator[Tuple[Feature, Address]]: + """extract indirect function calls (e.g., call eax or call dword ptr [edx+4]) + does not include calls like => call ds:dword_ABD4974 + + most relevant at the function or basic block scope; + however, its most efficient to extract at the instruction scope + """ + insn: DisassemblyInstruction = ih.inner + func: Function = fh.inner + + llil = func.get_llil_at(ih.address) + if llil is None or llil.operation not in [ + LowLevelILOperation.LLIL_CALL, + LowLevelILOperation.LLIL_CALL_STACK_ADJUST, + LowLevelILOperation.LLIL_TAILCALL, + ]: + return + + if llil.dest.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: + return + + if llil.dest.operation == LowLevelILOperation.LLIL_LOAD: + src = llil.dest.src + if src.operation in [LowLevelILOperation.LLIL_CONST, LowLevelILOperation.LLIL_CONST_PTR]: + return + + yield Characteristic("indirect call"), ih.address + + +def extract_features(f: FunctionHandle, bbh: BBHandle, insn: InsnHandle) -> Iterator[Tuple[Feature, Address]]: + """extract instruction features""" + for inst_handler in INSTRUCTION_HANDLERS: + for feature, ea in inst_handler(f, bbh, insn): + yield feature, ea + + +INSTRUCTION_HANDLERS = ( + extract_insn_api_features, + extract_insn_number_features, + extract_insn_bytes_features, + 
extract_insn_string_features, + extract_insn_offset_features, + extract_insn_nzxor_characteristic_features, + extract_insn_mnemonic_features, + extract_insn_obfs_call_plus_5_characteristic_features, + extract_insn_peb_access_characteristic_features, + extract_insn_cross_section_cflow, + extract_insn_segment_access_features, + extract_function_calls_from, + extract_function_indirect_call_characteristic_features, +) + + +def main(): + """ """ + if len(sys.argv) < 2: + return + + import pprint + + from binaryninja import BinaryViewType + + import capa.features.extractors.binja.helpers as binja_helpers + + bv: BinaryView = BinaryViewType.get_view_of_file(sys.argv[1]) + if bv is None: + return + + features = [] + for f in bv.functions: + fh = FunctionHandle(address=AbsoluteVirtualAddress(f.start), inner=f) + for bb in f.basic_blocks: + bbh = BBHandle(address=AbsoluteVirtualAddress(bb.start), inner=bb) + addr = bb.start + for text, length in bb: + insn = binja_helpers.DisassemblyInstruction(addr, length, text) + insn_handle = InsnHandle(address=AbsoluteVirtualAddress(addr), inner=insn) + addr += length + features.extend(list(extract_features(fh, bbh, insn_handle))) + + import pprint + + pprint.pprint(features) + + +if __name__ == "__main__": + main() diff --git a/capa/main.py b/capa/main.py index ba03c7a4..adfb7e52 100644 --- a/capa/main.py +++ b/capa/main.py @@ -73,6 +73,7 @@ RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" BACKEND_VIV = "vivisect" BACKEND_DOTNET = "dotnet" +BACKEND_BINJA = "binja" E_MISSING_RULES = 10 E_MISSING_FILE = 11 @@ -513,6 +514,33 @@ def get_extractor( return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path) + elif backend == BACKEND_BINJA: + from capa.features.extractors.binja.find_binja_api import find_binja_path + + # When we are running as a standalone executable, we cannot directly import binaryninja + # We need to first find the binja API installation path and add
it into sys.path + if is_running_standalone(): + bn_api = find_binja_path() + if os.path.exists(bn_api): + sys.path.append(bn_api) + + try: + from binaryninja import BinaryView, BinaryViewType + except ImportError: + raise RuntimeError( + "Cannot import binaryninja module. Please install the Binary Ninja Python API first: " + "https://docs.binary.ninja/dev/batch.html#install-the-api)." + ) + + import capa.features.extractors.binja.extractor + + with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress): + bv: BinaryView = BinaryViewType.get_view_of_file(path) + if bv is None: + raise RuntimeError("Binary Ninja cannot open file %s" % (path)) + + return capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) + # default to use vivisect backend else: import capa.features.extractors.viv.extractor @@ -859,7 +887,7 @@ def install_common_args(parser, wanted=None): "--backend", type=str, help="select the backend to use", - choices=(BACKEND_VIV,), + choices=(BACKEND_VIV, BACKEND_BINJA), default=BACKEND_VIV, ) diff --git a/tests/fixtures.py b/tests/fixtures.py index 1c4d2ad3..cd262271 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -158,6 +158,29 @@ def get_dnfile_extractor(path): return extractor +@lru_cache(maxsize=1) +def get_binja_extractor(path): + from binaryninja import Settings, BinaryViewType + + import capa.features.extractors.binja.extractor + + # Workaround for a BN bug: https://github.com/Vector35/binaryninja-api/issues/4051 + settings = Settings() + if path.endswith("kernel32-64.dll_"): + old_pdb = settings.get_bool("pdb.loadGlobalSymbols") + settings.set_bool("pdb.loadGlobalSymbols", False) + bv = BinaryViewType.get_view_of_file(path) + if path.endswith("kernel32-64.dll_"): + settings.set_bool("pdb.loadGlobalSymbols", old_pdb) + + extractor = capa.features.extractors.binja.extractor.BinjaFeatureExtractor(bv) + + # overload the extractor so that the fixture exposes `extractor.path` + 
setattr(extractor, "path", path) + + return extractor + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): diff --git a/tests/test_binja_features.py b/tests/test_binja_features.py new file mode 100644 index 00000000..9d6e5052 --- /dev/null +++ b/tests/test_binja_features.py @@ -0,0 +1,38 @@ +# Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at: [package root]/LICENSE.txt +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and limitations under the License. +import fixtures +from fixtures import * + +# We need to skip the binja test if we cannot import binaryninja, e.g., in GitHub CI. 
+binja_present: bool = False +try: + import binaryninja + + binja_present = True +except ImportError: + pass + + +@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_PRESENCE_TESTS, + indirect=["sample", "scope"], +) +def test_binja_features(sample, scope, feature, expected): + fixtures.do_test_feature_presence(fixtures.get_binja_extractor, sample, scope, feature, expected) + + +@pytest.mark.skipif(binja_present is False, reason="Skip binja tests if the binaryninja Python API is not installed") +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_COUNT_TESTS, + indirect=["sample", "scope"], +) +def test_binja_feature_counts(sample, scope, feature, expected): + fixtures.do_test_feature_count(fixtures.get_binja_extractor, sample, scope, feature, expected)