diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 4d14d41a..94032cff 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -206,3 +206,54 @@ jobs: GHIDRA_INSTALL_DIR: ${{ github.workspace }}/.github/ghidra/ghidra_${{ matrix.ghidra-version }}_PUBLIC run: pytest -v tests/test_ghidra_features.py + idalib-tests: + name: IDA ${{ matrix.ida.version }} tests for ${{ matrix.python-version }} + runs-on: ubuntu-22.04 + needs: [tests] + env: + IDA_LICENSE_ID: ${{ secrets.IDA_LICENSE_ID }} + strategy: + fail-fast: false + matrix: + python-version: ["3.10", "3.13"] + ida: + - version: 9.0 + slug: "release/9.0/ida-essential/ida-essential_90_x64linux.run" + - version: 9.1 + slug: "release/9.1/ida-essential/ida-essential_91_x64linux.run" + - version: 9.2 + slug: "release/9.2/ida-essential/ida-essential_92_x64linux.run" + steps: + - name: Checkout capa with submodules + # do only run if IDA_LICENSE_ID is available, have to do this in every step, see https://github.com/orgs/community/discussions/26726#discussioncomment-3253118 + if: ${{ env.IDA_LICENSE_ID != 0 }} + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + submodules: recursive + - name: Set up Python ${{ matrix.python-version }} + if: ${{ env.IDA_LICENSE_ID != 0 }} + uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + with: + python-version: ${{ matrix.python-version }} + - name: Setup uv + if: ${{ env.IDA_LICENSE_ID != 0 }} + uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # v7.2.0 + - name: Install dependencies + if: ${{ env.IDA_LICENSE_ID != 0 }} + run: sudo apt-get install -y libyaml-dev + - name: Install capa + if: ${{ env.IDA_LICENSE_ID != 0 }} + run: | + pip install -r requirements.txt + pip install -e .[dev,scripts] + pip install idapro + - name: Install IDA ${{ matrix.ida.version }} + if: ${{ env.IDA_LICENSE_ID != 0 }} + run: | + uv run hcli --disable-updates ida install --download-id ${{ 
matrix.ida.slug }} --license-id ${{ secrets.IDA_LICENSE_ID }} --set-default --yes + env: + HCLI_API_KEY: ${{ secrets.HCLI_API_KEY }} + IDA_LICENSE_ID: ${{ secrets.IDA_LICENSE_ID }} + - name: Run tests + if: ${{ env.IDA_LICENSE_ID != 0 }} + run: pytest -v tests/test_idalib_features.py # explicitly refer to the idalib tests for performance. other tests run above. diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 262b600e..d26911c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -138,6 +138,7 @@ repos: - "--ignore=tests/test_ghidra_features.py" - "--ignore=tests/test_ida_features.py" - "--ignore=tests/test_viv_features.py" + - "--ignore=tests/test_idalib_features.py" - "--ignore=tests/test_main.py" - "--ignore=tests/test_scripts.py" always_run: true diff --git a/CHANGELOG.md b/CHANGELOG.md index b5455117..784b2aed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -58,6 +58,7 @@ Additionally a Binary Ninja bug has been fixed. Released binaries now include AR ### New Features - ci: add support for arm64 binary releases +- tests: run tests against IDA via idalib @williballenthin #2742 ### Breaking Changes diff --git a/capa/features/extractors/ida/function.py b/capa/features/extractors/ida/function.py index 30c16a1e..6ff1f28d 100644 --- a/capa/features/extractors/ida/function.py +++ b/capa/features/extractors/ida/function.py @@ -18,6 +18,7 @@ import idaapi import idautils import capa.features.extractors.ida.helpers +from capa.features.file import FunctionName from capa.features.common import Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress from capa.features.extractors import loops @@ -50,10 +51,39 @@ def extract_recursive_call(fh: FunctionHandle): yield Characteristic("recursive call"), fh.address +def extract_function_name(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]: + ea = fh.inner.start_ea + name = idaapi.get_name(ea) + if name.startswith("sub_"): + # skip default names, like 
"sub_401000" + return + + yield FunctionName(name), fh.address + if name.startswith("_"): + # some linkers may prefix linked routines with a `_` to avoid name collisions. + # extract features for both the mangled and un-mangled representations. + # e.g. `_fwrite` -> `fwrite` + # see: https://stackoverflow.com/a/2628384/87207 + yield FunctionName(name[1:]), fh.address + + +def extract_function_alternative_names(fh: FunctionHandle): + """Get all alternative names for an address.""" + + for aname in capa.features.extractors.ida.helpers.get_function_alternative_names(fh.inner.start_ea): + yield FunctionName(aname), fh.address + + def extract_features(fh: FunctionHandle) -> Iterator[tuple[Feature, Address]]: for func_handler in FUNCTION_HANDLERS: for feature, addr in func_handler(fh): yield feature, addr -FUNCTION_HANDLERS = (extract_function_calls_to, extract_function_loop, extract_recursive_call) +FUNCTION_HANDLERS = ( + extract_function_calls_to, + extract_function_loop, + extract_recursive_call, + extract_function_name, + extract_function_alternative_names, +) diff --git a/capa/features/extractors/ida/helpers.py b/capa/features/extractors/ida/helpers.py index 365a2067..ea0b21c8 100644 --- a/capa/features/extractors/ida/helpers.py +++ b/capa/features/extractors/ida/helpers.py @@ -20,6 +20,7 @@ import idaapi import ida_nalt import idautils import ida_bytes +import ida_funcs import ida_segment from capa.features.address import AbsoluteVirtualAddress @@ -436,3 +437,16 @@ def is_basic_block_return(bb: idaapi.BasicBlock) -> bool: def has_sib(oper: idaapi.op_t) -> bool: # via: https://reverseengineering.stackexchange.com/a/14300 return oper.specflag1 == 1 + + +def find_alternative_names(cmt: str): + for line in cmt.split("\n"): + if line.startswith("Alternative name is '") and line.endswith("'"): + name = line[len("Alternative name is '") : -1] # Extract name between quotes + yield name + + +def get_function_alternative_names(fva: int): + """Get all alternative names for 
an address.""" + yield from find_alternative_names(ida_bytes.get_cmt(fva, False) or "") + yield from find_alternative_names(ida_funcs.get_func_cmt(idaapi.get_func(fva), False) or "") diff --git a/capa/features/extractors/ida/insn.py b/capa/features/extractors/ida/insn.py index 0e92b21f..86fd14b8 100644 --- a/capa/features/extractors/ida/insn.py +++ b/capa/features/extractors/ida/insn.py @@ -22,6 +22,7 @@ import idautils import capa.features.extractors.helpers import capa.features.extractors.ida.helpers +from capa.features.file import FunctionName from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic from capa.features.address import Address, AbsoluteVirtualAddress @@ -129,8 +130,8 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) # not a function (start) return - if target_func.flags & idaapi.FUNC_LIB: - name = idaapi.get_name(target_func.start_ea) + name = idaapi.get_name(target_func.start_ea) + if target_func.flags & idaapi.FUNC_LIB or not name.startswith("sub_"): yield API(name), ih.address if name.startswith("_"): # some linkers may prefix linked routines with a `_` to avoid name collisions. 
@@ -139,6 +140,10 @@ def extract_insn_api_features(fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle) # see: https://stackoverflow.com/a/2628384/87207 yield API(name[1:]), ih.address + for altname in capa.features.extractors.ida.helpers.get_function_alternative_names(target_func.start_ea): + yield FunctionName(altname), ih.address + yield API(altname), ih.address + def extract_insn_number_features( fh: FunctionHandle, bbh: BBHandle, ih: InsnHandle diff --git a/capa/loader.py b/capa/loader.py index c5446897..c0996610 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import io import os import logging import datetime @@ -23,24 +22,13 @@ from pathlib import Path from rich.console import Console from typing_extensions import assert_never -import capa.perf import capa.rules -import capa.engine -import capa.helpers import capa.version -import capa.render.json -import capa.rules.cache -import capa.render.default -import capa.render.verbose import capa.features.common import capa.features.freeze as frz -import capa.render.vverbose import capa.features.extractors -import capa.render.result_document import capa.render.result_document as rdoc import capa.features.extractors.common -import capa.features.extractors.base_extractor -import capa.features.extractors.cape.extractor from capa.rules import RuleSet from capa.engine import MatchResults from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError @@ -346,12 +334,24 @@ def get_extractor( import capa.features.extractors.ida.extractor logger.debug("idalib: opening database...") - # idalib writes to stdout (ugh), so we have to capture that - # so as not to screw up structured output. 
- with capa.helpers.stdout_redirector(io.BytesIO()): - with console.status("analyzing program...", spinner="dots"): - if idapro.open_database(str(input_path), run_auto_analysis=True): - raise RuntimeError("failed to analyze input file") + idapro.enable_console_messages(False) + with console.status("analyzing program...", spinner="dots"): + # we set the primary and secondary Lumina servers to 0.0.0.0 to disable Lumina, + # which sometimes provides bad names, including overwriting names from debug info. + # + # use -R to load resources, which can help us find embedded PE files. + # + # return values from open_database: + # 0 - Success + # 2 - User cancelled or 32-64 bit conversion failed + # 4 - Database initialization failed + # -1 - Generic errors (database already open, auto-analysis failed, etc.) + # -2 - User cancelled operation + ret = idapro.open_database( + str(input_path), run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R" + ) + if ret != 0: + raise RuntimeError("failed to analyze input file") logger.debug("idalib: waiting for analysis...") ida_auto.auto_wait() diff --git a/pyproject.toml b/pyproject.toml index 75b0a3b3..03841c4a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -109,6 +109,13 @@ dependencies = [ ] dynamic = ["version"] +[tool.pytest.ini_options] +filterwarnings = [ + "ignore:builtin type SwigPyPacked has no __module__ attribute:DeprecationWarning", + "ignore:builtin type SwigPyObject has no __module__ attribute:DeprecationWarning", + "ignore:builtin type swigvarlink has no __module__ attribute:DeprecationWarning", +] + [tool.setuptools.dynamic] version = {attr = "capa.version.__version__"} diff --git a/tests/fixtures.py b/tests/fixtures.py index 22303a24..bbe51e77 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- +import logging import contextlib import collections from pathlib import Path @@ -20,7 +20,6 @@ from functools import lru_cache import pytest -import capa.main import capa.features.file import capa.features.insn import capa.features.common @@ -53,6 +52,7 @@ from capa.features.extractors.base_extractor import ( ) from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor +logger = logging.getLogger(__name__) CD = Path(__file__).resolve().parent DOTNET_DIR = CD / "data" / "dotnet" DNFILE_TESTFILES = DOTNET_DIR / "dnfile-testfiles" @@ -200,6 +200,73 @@ def get_binja_extractor(path: Path): return extractor +# we can't easily cache this because the extractor relies on global state (the opened database) +# which also has to be closed elsewhere. so, the idalib tests will just take a little bit to run. +def get_idalib_extractor(path: Path): + import capa.features.extractors.ida.idalib as idalib + + if not idalib.has_idalib(): + raise RuntimeError("cannot find IDA idalib module.") + + if not idalib.load_idalib(): + raise RuntimeError("failed to load IDA idalib module.") + + import idapro + import ida_auto + + import capa.features.extractors.ida.extractor + + logger.debug("idalib: opening database...") + + idapro.enable_console_messages(False) + + # we set the primary and secondary Lumina servers to 0.0.0.0 to disable Lumina, + # which sometimes provides bad names, including overwriting names from debug info. + # + # use -R to load resources, which can help us find embedded PE files. + # + # return values from open_database: + # 0 - Success + # 2 - User cancelled or 32-64 bit conversion failed + # 4 - Database initialization failed + # -1 - Generic errors (database already open, auto-analysis failed, etc.) 
+ # -2 - User cancelled operation + ret = idapro.open_database( + str(path), run_auto_analysis=True, args="-Olumina:host=0.0.0.0 -Osecondary_lumina:host=0.0.0.0 -R" + ) + if ret != 0: + raise RuntimeError("failed to analyze input file") + + logger.debug("idalib: waiting for analysis...") + ida_auto.auto_wait() + logger.debug("idalib: opened database.") + + extractor = capa.features.extractors.ida.extractor.IdaFeatureExtractor() + fixup_idalib(path, extractor) + return extractor + + +def fixup_idalib(path: Path, extractor): + """ + IDA fixups to overcome differences between backends + """ + import idaapi + import ida_funcs + + def remove_library_id_flag(fva): + f = idaapi.get_func(fva) + f.flags &= ~ida_funcs.FUNC_LIB + ida_funcs.update_func(f) + + if "kernel32-64" in path.name: + # remove (correct) library function id, so we can test x64 thunk + remove_library_id_flag(0x1800202B0) + + if "al-khaser_x64" in path.name: + # remove (correct) library function id, so we can test x64 nested thunk + remove_library_id_flag(0x14004B4F0) + + @lru_cache(maxsize=1) def get_cape_extractor(path): from capa.helpers import load_json_from_path @@ -914,20 +981,8 @@ FEATURE_PRESENCE_TESTS = sorted( ("mimikatz", "function=0x4556E5", capa.features.insn.API("advapi32.LsaQueryInformationPolicy"), False), ("mimikatz", "function=0x4556E5", capa.features.insn.API("LsaQueryInformationPolicy"), True), # insn/api: x64 - ( - "kernel32-64", - "function=0x180001010", - capa.features.insn.API("RtlVirtualUnwind"), - True, - ), ("kernel32-64", "function=0x180001010", capa.features.insn.API("RtlVirtualUnwind"), True), # insn/api: x64 thunk - ( - "kernel32-64", - "function=0x1800202B0", - capa.features.insn.API("RtlCaptureContext"), - True, - ), ("kernel32-64", "function=0x1800202B0", capa.features.insn.API("RtlCaptureContext"), True), # insn/api: x64 nested thunk ("al-khaser x64", "function=0x14004B4F0", capa.features.insn.API("__vcrt_GetModuleHandle"), True), @@ -1015,20 +1070,20 @@ 
FEATURE_PRESENCE_TESTS = sorted( ("pma16-01", "file", OS(OS_WINDOWS), True), ("pma16-01", "file", OS(OS_LINUX), False), ("mimikatz", "file", OS(OS_WINDOWS), True), - ("pma16-01", "function=0x404356", OS(OS_WINDOWS), True), - ("pma16-01", "function=0x404356,bb=0x4043B9", OS(OS_WINDOWS), True), + ("pma16-01", "function=0x401100", OS(OS_WINDOWS), True), + ("pma16-01", "function=0x401100,bb=0x401130", OS(OS_WINDOWS), True), ("mimikatz", "function=0x40105D", OS(OS_WINDOWS), True), ("pma16-01", "file", Arch(ARCH_I386), True), ("pma16-01", "file", Arch(ARCH_AMD64), False), ("mimikatz", "file", Arch(ARCH_I386), True), - ("pma16-01", "function=0x404356", Arch(ARCH_I386), True), - ("pma16-01", "function=0x404356,bb=0x4043B9", Arch(ARCH_I386), True), + ("pma16-01", "function=0x401100", Arch(ARCH_I386), True), + ("pma16-01", "function=0x401100,bb=0x401130", Arch(ARCH_I386), True), ("mimikatz", "function=0x40105D", Arch(ARCH_I386), True), ("pma16-01", "file", Format(FORMAT_PE), True), ("pma16-01", "file", Format(FORMAT_ELF), False), ("mimikatz", "file", Format(FORMAT_PE), True), # format is also a global feature - ("pma16-01", "function=0x404356", Format(FORMAT_PE), True), + ("pma16-01", "function=0x401100", Format(FORMAT_PE), True), ("mimikatz", "function=0x456BB9", Format(FORMAT_PE), True), # elf support ("7351f.elf", "file", OS(OS_LINUX), True), diff --git a/tests/test_idalib_features.py b/tests/test_idalib_features.py new file mode 100644 index 00000000..8604b94e --- /dev/null +++ b/tests/test_idalib_features.py @@ -0,0 +1,86 @@ +# Copyright 2020 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import logging +from pathlib import Path + +import pytest +import fixtures + +import capa.features.extractors.ida.idalib +from capa.features.file import FunctionName +from capa.features.insn import API +from capa.features.common import Characteristic + +logger = logging.getLogger(__name__) + +idalib_present = capa.features.extractors.ida.idalib.has_idalib() +if idalib_present: + try: + import idapro # noqa: F401 [imported but unused] + import ida_kernwin + + kernel_version: str = ida_kernwin.get_kernel_version() + except ImportError: + idalib_present = False + kernel_version = "0.0" + + +@pytest.mark.skipif(idalib_present is False, reason="Skip idalib tests if the idalib Python API is not installed") +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_PRESENCE_TESTS + fixtures.FEATURE_SYMTAB_FUNC_TESTS, + indirect=["sample", "scope"], +) +def test_idalib_features(sample: Path, scope, feature, expected): + if kernel_version in {"9.0", "9.1"} and sample.name.startswith("2bf18d"): + if isinstance(feature, (API, FunctionName)) and feature.value == "__libc_connect": + # see discussion here: https://github.com/mandiant/capa/pull/2742#issuecomment-3674146335 + # + # > i confirmed that there were changes in 9.2 related to the ELF loader handling names, + # > so I think its reasonable to conclude that 9.1 and older had a bug that + # > prevented this name from surfacing. 
+ pytest.xfail(f"IDA {kernel_version} does not extract all ELF symbols") + + if kernel_version in {"9.0"} and sample.name.startswith("Practical Malware Analysis Lab 12-04.exe_"): + if isinstance(feature, Characteristic) and feature.value == "embedded pe": + # see discussion here: https://github.com/mandiant/capa/pull/2742#issuecomment-3667086165 + # + # idalib for IDA 9.0 doesn't support argv arguments, so we can't ask that resources are loaded + pytest.xfail("idalib 9.0 does not support loading resource segments") + + try: + fixtures.do_test_feature_presence(fixtures.get_idalib_extractor, sample, scope, feature, expected) + finally: + logger.debug("closing database...") + import idapro + + idapro.close_database(save=False) + logger.debug("closed database.") + + +@pytest.mark.skipif(idalib_present is False, reason="Skip idalib tests if the idalib Python API is not installed") +@fixtures.parametrize( + "sample,scope,feature,expected", + fixtures.FEATURE_COUNT_TESTS, + indirect=["sample", "scope"], +) +def test_idalib_feature_counts(sample, scope, feature, expected): + try: + fixtures.do_test_feature_count(fixtures.get_idalib_extractor, sample, scope, feature, expected) + finally: + logger.debug("closing database...") + import idapro + + idapro.close_database(save=False) + logger.debug("closed database.")