add idalib backend

introduce script to detect 3P backends
ref #2376
2025-12-12 15:49:46 -08:00 · 2024-09-20 10:47:21 +00:00 · 2024-09-20 09:03:46 +00:00
9 changed files with 405 additions and 4 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,8 @@

 ### New Features

+- add IDA v9.0 backend via idalib #2376 @williballenthin
+
 ### Breaking Changes

 ### New Rules (0)
--- a/capa/features/extractors/binja/find_binja_api.py
+++ b/capa/features/extractors/binja/find_binja_api.py
@@ -20,9 +20,9 @@ from importlib import util
 spec = util.find_spec('binaryninja')
 if spec is not None:
    if len(spec.submodule_search_locations) > 0:
-            path = Path(spec.submodule_search_locations[0])
-            # encode the path with utf8 then convert to hex, make sure it can be read and restored properly
-            print(str(path.parent).encode('utf8').hex())
+        path = Path(spec.submodule_search_locations[0])
+        # encode the path with utf8 then convert to hex, make sure it can be read and restored properly
+        print(str(path.parent).encode('utf8').hex())
 """


--- a/capa/features/extractors/ida/extractor.py
+++ b/capa/features/extractors/ida/extractor.py
@@ -32,7 +32,9 @@ class IdaFeatureExtractor(StaticFeatureExtractor):
    def __init__(self):
        super().__init__(
            hashes=SampleHashes(
-                md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256()
+                md5=ida_nalt.retrieve_input_file_md5().hex(),
+                sha1="(unknown)",
+                sha256=ida_nalt.retrieve_input_file_sha256().hex(),
            )
        )
        self.global_features: List[Tuple[Feature, Address]] = []
--- a/capa/features/extractors/ida/idalib.py
+++ b/capa/features/extractors/ida/idalib.py
@@ -0,0 +1,113 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+import os
+import sys
+import json
+import logging
+import importlib.util
+from typing import Optional
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
+def is_idalib_installed() -> bool:
+    """Report whether the import system can locate the `ida` module (the module is not imported)."""
+    try:
+        spec = importlib.util.find_spec("ida")
+    except ModuleNotFoundError:
+        return False
+    return spec is not None
+
+
+def get_idalib_user_config_path() -> Optional[Path]:
+    """
+    Get the path to the user's IDA config file, following IDA's per-platform user directories.
+
+    Returns:
+      the path to `ida-config.json`, or None when that file does not exist
+      or, on Windows, when the APPDATA environment variable is not set.
+    """
+    # derived from `py-activate-idalib.py` from IDA v9.0 Beta 4
+
+    if sys.platform == "win32":
+        # On Windows, use the %APPDATA%\Hex-Rays\IDA Pro directory
+        appdata = os.getenv("APPDATA")
+        if not appdata:
+            # Path(None) raises TypeError, so treat a missing variable as "no config".
+            return None
+        config_dir = Path(appdata) / "Hex-Rays" / "IDA Pro"
+    else:
+        # On macOS and Linux, use ~/.idapro
+        config_dir = Path.home() / ".idapro"
+
+    # Return the full path to the config file (now in JSON format)
+    user_config_path = config_dir / "ida-config.json"
+    if not user_config_path.exists():
+        return None
+    return user_config_path
+
+
+def find_idalib() -> Optional[Path]:
+    """
+    Locate the idalib Python package directory inside the IDA installation named by the user's IDA config.
+
+    Returns:
+      the directory that contains the `ida` package (suitable for adding to `sys.path`),
+      or None when the config is missing or unusable, the platform is not supported,
+      or the installation lacks any of the expected files.
+    """
+    config_path = get_idalib_user_config_path()
+    if not config_path:
+        return None
+
+    try:
+        config = json.loads(config_path.read_text(encoding="utf-8"))
+    except (OSError, ValueError):
+        # unreadable or malformed config; JSON and text decoding errors are both ValueErrors.
+        logger.debug("failed to read IDA config: %s", config_path)
+        return None
+
+    try:
+        ida_install_dir = Path(config["Paths"]["ida-install-dir"])
+    except (KeyError, TypeError):
+        # KeyError: entry is absent. TypeError: the config doesn't have the expected shape.
+        return None
+
+    if not ida_install_dir.exists():
+        return None
+
+    libname = {
+        "win32": "idalib.dll",
+        "linux": "libidalib.so",
+        "linux2": "libidalib.so",
+        "darwin": "libidalib.dylib",
+    }.get(sys.platform)
+    if libname is None:
+        # no known idalib library name for this platform
+        return None
+
+    if not (ida_install_dir / "ida.hlp").is_file():
+        return None
+
+    if not (ida_install_dir / libname).is_file():
+        return None
+
+    idalib_path = ida_install_dir / "idalib" / "python"
+    if not idalib_path.exists():
+        return None
+
+    if not (idalib_path / "ida" / "__init__.py").is_file():
+        return None
+
+    return idalib_path
+
+
+def has_idalib() -> bool:
+    """Report whether idalib is available: either already importable, or discoverable via the IDA config."""
+    if is_idalib_installed():
+        logger.debug("found installed IDA idalib API")
+        return True
+
+    logger.debug("IDA idalib API not installed, searching...")
+
+    idalib_path = find_idalib()
+    if not idalib_path:
+        logger.debug("failed to find IDA idalib installation")
+        # return here so that we don't also claim to have found the API.
+        return False
+
+    logger.debug("found IDA idalib API: %s", idalib_path)
+    return True
+
+
+def load_idalib() -> bool:
+    """
+    Try to import the `ida` module, extending `sys.path` with the discovered idalib directory if needed.
+
+    Returns:
+      True when `ida` was imported, False when it could not be found or imported.
+    """
+    try:
+        import ida
+
+        return True
+    except ImportError:
+        # not importable as-is: look for an IDA installation and retry from there.
+        idalib_path = find_idalib()
+        if not idalib_path:
+            return False
+
+        # note: the entry stays on sys.path even when the retry below fails.
+        sys.path.append(idalib_path.absolute().as_posix())
+        try:
+            import ida  # noqa: F401 unused import
+
+            return True
+        except ImportError:
+            return False
--- a/capa/helpers.py
+++ b/capa/helpers.py
@@ -6,9 +6,12 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import os
+import io
 import sys
 import gzip
+import ctypes
 import inspect
+import tempfile
 import logging
 import contextlib
 import importlib.util
@@ -81,6 +84,47 @@ def assert_never(value) -> NoReturn:
    assert False, f"Unhandled value: {value} ({type(value).__name__})"  # noqa: B011


+# Redirect stdout at the C runtime level,
+# which lets us handle native libraries that spam stdout.
+# via: https://eli.thegreenplace.net/2015/redirecting-all-kinds-of-stdout-in-python/
+#
+# Resolving the C runtime and its `stdout` symbol this way is not possible on every platform,
+# so record None instead of failing while this module is being imported.
+try:
+    LIBC = ctypes.CDLL(None)
+    C_STDOUT = ctypes.c_void_p.in_dll(LIBC, "stdout")
+except (OSError, TypeError, ValueError):
+    LIBC = None
+    C_STDOUT = None
+
+
+@contextlib.contextmanager
+def stdout_redirector(stream):
+    """
+    Capture everything written to stdout, including by native code, while the `with` body runs.
+
+    Args:
+      stream: binary stream; receives the captured bytes once the body completes without raising.
+
+    stdout is restored even when the body raises.
+    When the C runtime's stdout could not be resolved, the body runs without any redirection.
+    """
+    if LIBC is None or C_STDOUT is None:
+        logging.getLogger(__name__).debug("unable to capture stdout on this platform")
+        yield
+        return
+
+    # The original fd stdout points to. Usually 1 on POSIX systems.
+    original_stdout_fd = sys.stdout.fileno()
+
+    def _redirect_stdout(to_fd):
+        """Redirect stdout to the given file descriptor."""
+        # Flush the C-level buffer stdout
+        LIBC.fflush(C_STDOUT)
+        # Flush and close sys.stdout - also closes the file descriptor (fd)
+        sys.stdout.close()
+        # Make original_stdout_fd point to the same file as to_fd
+        os.dup2(to_fd, original_stdout_fd)
+        # Create a new sys.stdout that points to the redirected fd
+        sys.stdout = io.TextIOWrapper(os.fdopen(original_stdout_fd, "wb"))
+
+    # Save a copy of the original stdout fd in saved_stdout_fd
+    saved_stdout_fd = os.dup(original_stdout_fd)
+    try:
+        # Create a temporary file and redirect stdout to it
+        with tempfile.TemporaryFile(mode="w+b") as tfile:
+            _redirect_stdout(tfile.fileno())
+            try:
+                # Yield to caller, then redirect stdout back to the saved fd,
+                # even if the caller raised: otherwise all later output would be lost.
+                yield
+            finally:
+                _redirect_stdout(saved_stdout_fd)
+
+            # Copy contents of temporary file to the given stream
+            tfile.flush()
+            tfile.seek(0, io.SEEK_SET)
+            stream.write(tfile.read())
+    finally:
+        os.close(saved_stdout_fd)
+
+
 def load_json_from_path(json_path: Path):
    with gzip.open(json_path, "r") as compressed_report:
        try:
--- a/capa/loader.py
+++ b/capa/loader.py
@@ -5,6 +5,7 @@
 # Unless required by applicable law or agreed to in writing, software distributed under the License
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
+import io
 import os
 import sys
 import logging
@@ -69,6 +70,7 @@ BACKEND_DRAKVUF = "drakvuf"
 BACKEND_VMRAY = "vmray"
 BACKEND_FREEZE = "freeze"
 BACKEND_BINEXPORT2 = "binexport2"
+BACKEND_IDA = "ida"


 class CorruptFile(ValueError):
@@ -321,6 +323,36 @@ def get_extractor(

        return capa.features.extractors.binexport2.extractor.BinExport2FeatureExtractor(be2, buf)

+    elif backend == BACKEND_IDA:
+        import capa.features.extractors.ida.idalib as idalib
+
+        if not idalib.has_idalib():
+            raise RuntimeError(
+                # TODO(williballenthin): add more details here
+                "cannot find IDA idalib  module."
+            )
+
+        if not idalib.load_idalib():
+            raise RuntimeError("failed to load IDA idalib  module.")
+
+        import ida
+        import ida_auto
+
+        import capa.features.extractors.ida.extractor
+
+        logger.debug("idalib: opening database...")
+        # idalib writes to stdout (ugh), so we have to capture that
+        # so as not to screw up structured output.
+        with capa.helpers.stdout_redirector(io.BytesIO()):
+            if ida.open_database(str(input_path), run_auto_analysis=True):
+                raise RuntimeError("failed to analyze input file")
+
+            logger.debug("idalib: waiting for analysis...")
+            ida_auto.auto_wait()
+            logger.debug("idalib: opened database.")
+
+        return capa.features.extractors.ida.extractor.IdaFeatureExtractor()
+
    else:
        raise ValueError("unexpected backend: " + backend)

--- a/capa/main.py
+++ b/capa/main.py
@@ -43,6 +43,7 @@ import capa.features.extractors.common
 from capa.rules import RuleSet
 from capa.engine import MatchResults
 from capa.loader import (
+    BACKEND_IDA,
    BACKEND_VIV,
    BACKEND_CAPE,
    BACKEND_BINJA,
@@ -283,6 +284,7 @@ def install_common_args(parser, wanted=None):
        backends = [
            (BACKEND_AUTO, "(default) detect appropriate backend automatically"),
            (BACKEND_VIV, "vivisect"),
+            (BACKEND_IDA, "IDA via idalib"),
            (BACKEND_PEFILE, "pefile (file features only)"),
            (BACKEND_BINJA, "Binary Ninja"),
            (BACKEND_DOTNET, ".NET"),
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -183,7 +183,9 @@ known_first_party = [
    "binaryninja",
    "flirt",
    "ghidra",
+    "ida",
    "ida_ida",
+    "ida_auto",
    "ida_bytes",
    "ida_entry",
    "ida_funcs",
--- a/scripts/detect-backends.py
+++ b/scripts/detect-backends.py
@@ -0,0 +1,204 @@
+# Copyright (C) 2024 Mandiant, Inc. All Rights Reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at: [package root]/LICENSE.txt
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and limitations under the License.
+
+import os
+import sys
+import logging
+import importlib.util
+from typing import Optional
+from pathlib import Path
+
+import rich
+import rich.table
+
+from capa.features.extractors.ida.idalib import find_idalib, load_idalib, is_idalib_installed
+
+logger = logging.getLogger(__name__)
+
+
+def get_desktop_entry(name: str) -> Optional[Path]:
+    """
+    Find the path for the given XDG Desktop Entry name.
+
+    Searches the `applications` subdirectory of each directory listed in $XDG_DATA_DIRS,
+    or of a default set of data directories when that variable is not set.
+
+    Like:
+
+        >> get_desktop_entry("com.vector35.binaryninja.desktop")
+        Path("~/.local/share/applications/com.vector35.binaryninja.desktop")
+
+    Returns:
+      the path of the first matching entry, or None when there is none.
+    """
+    assert sys.platform in ("linux", "linux2")
+    assert name.endswith(".desktop")
+
+    # these are data directories: "applications" is appended to each of them below,
+    # so the defaults must not already include that component.
+    default_data_dirs = f"/usr/local/share:/usr/share:{Path.home()}/.local/share"
+    data_dirs = os.environ.get("XDG_DATA_DIRS", default_data_dirs)
+    for data_dir in data_dirs.split(":"):
+        if not data_dir:
+            # an empty element would otherwise resolve relative to the working directory.
+            continue
+
+        application = Path(data_dir) / "applications" / name
+        if application.exists():
+            return application
+
+    return None
+
+
+def get_binaryninja_path(desktop_entry: Path) -> Optional[Path]:
+    """
+    Extract the Binary Ninja installation directory from the `Exec=` line of a desktop entry.
+
+    Only the first `Exec=` line that ends with `binaryninja %u` is considered.
+
+    Returns:
+      the directory that precedes the executable name, or None when there is
+      no such line or the directory does not exist.
+    """
+    # from: Exec=/home/wballenthin/software/binaryninja/binaryninja %u
+    # to:        /home/wballenthin/software/binaryninja/
+    prefix = "Exec="
+    suffix = "binaryninja %u"
+
+    contents = desktop_entry.read_text(encoding="utf-8")
+    for entry_line in contents.splitlines():
+        if entry_line.startswith(prefix) and entry_line.endswith(suffix):
+            install_dir = Path(entry_line[len(prefix) : -len(suffix)])
+            return install_dir if install_dir.exists() else None
+
+    return None
+
+
+def find_binaryninja() -> Optional[Path]:
+    """
+    Locate the directory holding Binary Ninja's Python API via its XDG desktop entry.
+
+    Returns:
+      the `python` directory of the installation, or None when the desktop entry,
+      the installation, or the `binaryninja` package within it cannot be found.
+
+    Raises:
+      NotImplementedError: on any platform other than Linux.
+    """
+    if sys.platform not in ("linux", "linux2"):
+        # darwin, win32, and anything else: the search below relies on desktop entries.
+        raise NotImplementedError(f"unsupported platform: {sys.platform}")
+
+    logger.debug("detected OS: linux")
+
+    desktop_entry = get_desktop_entry("com.vector35.binaryninja.desktop")
+    if not desktop_entry:
+        return None
+    logger.debug("found Binary Ninja application: %s", desktop_entry)
+
+    binaryninja_path = get_binaryninja_path(desktop_entry)
+    if not binaryninja_path:
+        return None
+    logger.debug("found Binary Ninja installation: %s", binaryninja_path)
+
+    module_path = binaryninja_path / "python"
+    package_init = module_path / "binaryninja" / "__init__.py"
+    if module_path.exists() and package_init.exists():
+        return module_path
+
+    return None
+
+
+def is_binaryninja_installed() -> bool:
+    """Is the binaryninja module ready to import? (The module itself is not imported.)"""
+    try:
+        spec = importlib.util.find_spec("binaryninja")
+    except ModuleNotFoundError:
+        return False
+    return spec is not None
+
+
+def has_binaryninja() -> bool:
+    """Report whether the Binary Ninja API is available: already importable, or discoverable on disk."""
+    if is_binaryninja_installed():
+        logger.debug("found installed Binary Ninja API")
+        return True
+
+    logger.debug("Binary Ninja API not installed, searching...")
+
+    binaryninja_path = find_binaryninja()
+    if not binaryninja_path:
+        logger.debug("failed to find Binary Ninja installation")
+        # return here so that we don't also claim to have found the API.
+        return False
+
+    logger.debug("found Binary Ninja API: %s", binaryninja_path)
+    return True
+
+
+def load_binaryninja() -> bool:
+    """
+    Try to import the `binaryninja` module, extending `sys.path` with the discovered API directory if needed.
+
+    Returns:
+      True when `binaryninja` was imported, False when it could not be found or imported.
+    """
+    try:
+        import binaryninja
+
+        return True
+    except ImportError:
+        # not importable as-is: look for a Binary Ninja installation and retry from there.
+        binaryninja_path = find_binaryninja()
+        if not binaryninja_path:
+            return False
+
+        # note: the entry stays on sys.path even when the retry below fails.
+        sys.path.append(binaryninja_path.absolute().as_posix())
+        try:
+            import binaryninja  # noqa: F401 unused import
+
+            return True
+        except ImportError:
+            return False
+
+
+def is_vivisect_installed() -> bool:
+    """Report whether the import system can locate the `vivisect` module (the module is not imported)."""
+    try:
+        spec = importlib.util.find_spec("vivisect")
+    except ModuleNotFoundError:
+        return False
+    return spec is not None
+
+
+def load_vivisect() -> bool:
+    """Try to import the `vivisect` module; return True on success and False on ImportError."""
+    try:
+        import vivisect  # noqa: F401 unused import
+    except ImportError:
+        return False
+    else:
+        return True
+
+
+def _report(action) -> str:
+    """Render the result of a detection step, treating "not implemented on this platform" as False."""
+    try:
+        return str(action())
+    except NotImplementedError:
+        # e.g. find_binaryninja() on a platform it cannot search:
+        # one unsupported backend should not abort the whole report.
+        return "False"
+
+
+def main():
+    """Print a table describing which analysis backends are installed, discoverable, and loadable."""
+    logging.basicConfig(level=logging.INFO)
+
+    table = rich.table.Table()
+    table.add_column("backend")
+    table.add_column("already installed?")
+    table.add_column("found?")
+    table.add_column("loads?")
+
+    # (display name, is it importable already?, how to search for it (if possible), how to load it)
+    backends = [
+        ("vivisect", is_vivisect_installed, None, load_vivisect),
+        ("Binary Ninja", is_binaryninja_installed, find_binaryninja, load_binaryninja),
+        ("IDA idalib", is_idalib_installed, find_idalib, load_idalib),
+    ]
+
+    for name, is_installed, find, load in backends:
+        row = [name]
+        if is_installed():
+            row.append("True")
+            # nothing to search for when the module is importable as-is.
+            row.append("-")
+        else:
+            row.append("False")
+            if find is None:
+                row.append("False")
+            else:
+                row.append(_report(lambda find=find: find() is not None))
+
+        row.append(_report(load))
+        table.add_row(*row)
+
+    rich.print(table)
+
+
+if __name__ == "__main__":
+    main()
Author	SHA1	Message	Date
Willi Ballenthin	02b59301dd	add idalib backend	2024-09-20 10:47:21 +00:00
Willi Ballenthin	a8e52615a3	introduce script to detect 3P backends ref #2376	2024-09-20 09:03:46 +00:00