Compare commits

...

10 Commits

Author SHA1 Message Date
Willi Ballenthin
03ce40e781 initial attempt at library identification via known strings 2024-10-10 12:35:48 +00:00
Moritz
1f7f24c467 Merge pull request #2454 from mandiant/fix/ida9idalib
Fix IDA 9.0 / idalib
2024-10-09 18:04:23 +02:00
mr-tz
f2c329b768 rename ida to idapro module for IDA 9.0 2024-10-09 12:20:38 +00:00
mr-tz
22368fbe6f rename bin_search function 2024-10-09 12:13:11 +00:00
Moritz
6a12ab8598 Merge pull request #2450 from mandiant/dependabot/pip/rich-13.9.2
build(deps): bump rich from 13.8.0 to 13.9.2
2024-10-08 10:57:04 +02:00
dependabot[bot]
a4fdb0a3ef build(deps): bump rich from 13.8.0 to 13.9.2
Bumps [rich](https://github.com/Textualize/rich) from 13.8.0 to 13.9.2.
- [Release notes](https://github.com/Textualize/rich/releases)
- [Changelog](https://github.com/Textualize/rich/blob/master/CHANGELOG.md)
- [Commits](https://github.com/Textualize/rich/compare/v13.8.0...v13.9.2)

---
updated-dependencies:
- dependency-name: rich
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-07 14:07:10 +00:00
Moritz
c7bb8b8e67 Update Node checkout Actions (#2446)
* Update setup Node Actions
2024-10-07 11:46:37 +02:00
Tamir K.
41c5194693 Fix/corrupted file architecture key error (#2444)
* Add try except clause
2024-10-06 08:46:16 +02:00
Moritz
8c8b67a6ea Merge pull request #2438 from mandiant/mr-tz-patch-2
Update build.yml
2024-10-04 14:22:45 +02:00
Moritz
f0cc0fb2b8 Update build.yml 2024-10-04 14:02:53 +02:00
36 changed files with 468 additions and 13 deletions

View File

@@ -114,7 +114,7 @@ jobs:
include:
- asset_name: linux
artifact_name: capa
- asset_name: linux-py311
- asset_name: linux-py312
artifact_name: capa
- asset_name: windows
artifact_name: capa.exe

View File

@@ -43,7 +43,7 @@ jobs:
fetch-depth: 1
show-progress: true
- name: Set up Node
uses: actions/setup-node@v4
uses: actions/setup-node@0a44ba7841725637a19e28fa30b79a866c81b0a6 # v4.0.4
with:
node-version: 20
cache: 'npm'

View File

@@ -19,7 +19,7 @@ jobs:
show-progress: true
- name: Set up Node
uses: actions/setup-node@v3
uses: actions/setup-node@0a44ba7841725637a19e28fa30b79a866c81b0a6 # v4.0.4
with:
node-version: 20
cache: 'npm'

View File

@@ -12,6 +12,9 @@
### Bug Fixes
- extractor: fix exception when PE extractor encounters unknown architecture #2440 @Tamir-K
- IDA Pro: rename ida to idapro module for plugin and idalib in IDA 9.0 #2453 @mr-tz
### capa Explorer Web
### capa Explorer IDA Pro plugin

View File

193
capa/analysis/libraries.py Normal file
View File

@@ -0,0 +1,193 @@
"""
further requirements:
- nltk
"""
import sys
import logging
import collections
from pathlib import Path
import rich
from rich.text import Text
import capa.analysis.strings
import capa.features.extractors.strings
from capa.analysis.strings import LibraryStringDatabase
logger = logging.getLogger(__name__)
def extract_strings(buf, n=4):
    """yield the ASCII strings, then the unicode strings, found in `buf`.

    `n` is forwarded unchanged to both capa string extractors.
    """
    extractors = (
        capa.features.extractors.strings.extract_ascii_strings,
        capa.features.extractors.strings.extract_unicode_strings,
    )
    for extract in extractors:
        for found in extract(buf, n=n):
            yield found
def prune_databases(dbs: list[LibraryStringDatabase], n=8):
    """remove less trustworthy database entries, in place.

    such as:
      - those found in multiple databases
      - those that are English words
      - those that are too short (fewer than `n` characters)
      - Windows API and DLL names

    args:
      dbs: the databases to prune; each `metadata_by_string` dict is mutated.
      n: strings shorter than this are removed.
    """
    # TODO: consider applying these filters directly to the persisted databases, not at load time.

    winapi = capa.analysis.strings.WindowsApiStringDatabase.from_defaults()

    import nltk
    from nltk.corpus import words as nltk_words

    # importing the corpus loader succeeds even when the dataset is absent:
    # NLTK only raises LookupError once the data is actually looked up.
    # so probe for the dataset explicitly and fetch it when missing.
    try:
        nltk.data.find("corpora/words")
    except LookupError:
        # one-time download of dataset.
        # this probably doesn't work well for embedded use.
        nltk.download("words")
    words = set(nltk_words.words())

    counter: collections.Counter = collections.Counter()
    to_remove = set()
    for db in dbs:
        for string in db.metadata_by_string:
            counter[string] += 1

            if (
                string in words
                or len(string) < n
                or string in winapi.api_names
                or string in winapi.dll_names
            ):
                to_remove.add(string)

    # remove strings that are seen in more than one database
    to_remove.update(string for string, count in counter.items() if count > 1)

    for db in dbs:
        for string in to_remove:
            db.metadata_by_string.pop(string, None)
def open_ida(input_path: Path):
    """open a copy of `input_path` with idalib and wait for auto-analysis.

    the input is copied into a fresh temporary directory before it is opened,
    and that copy is what gets passed to IDA.

    raises:
      RuntimeError: when idalib reports that the database could not be opened.
    """
    import tempfile

    import idapro

    t = Path(tempfile.mkdtemp(prefix="ida-")) / input_path.name
    t.write_bytes(input_path.read_bytes())
    # resource leak: we should delete this upon exit

    idapro.enable_console_messages(False)
    # open_database returns a truthy value on failure;
    # don't continue on to query a database that was never opened.
    if idapro.open_database(str(t.absolute()), run_auto_analysis=True):
        raise RuntimeError("failed to analyze input file")

    import ida_auto

    ida_auto.auto_wait()
def main():
    """report the known-library strings found in the file named by argv[1].

    first, scan the raw file bytes for strings present in the (pruned) library
    string databases and print them grouped by library.
    if there are any hits, open the file with IDA and print, per function,
    the database strings referenced by that function's instructions.
    """
    logging.basicConfig(level=logging.DEBUG)

    # use n=8 to ignore common words
    N = 8

    input_path = Path(sys.argv[1])
    # read the file once and reuse the buffer for string extraction
    input_buf = input_path.read_bytes()

    dbs = capa.analysis.strings.get_default_databases()
    prune_databases(dbs, n=N)

    strings_by_library = collections.defaultdict(set)
    for string in extract_strings(input_buf, n=N):
        for db in dbs:
            if metadata := db.metadata_by_string.get(string.s):
                strings_by_library[metadata.library_name].add(string.s)

    console = rich.get_console()
    console.print("found libraries:", style="bold")
    for library, strings in sorted(strings_by_library.items(), key=lambda p: len(p[1]), reverse=True):
        console.print(f" - [b]{library}[/] ({len(strings)} strings)")

        for string in sorted(strings)[:10]:
            console.print(f" - {string}", markup=False, style="grey37")

        if len(strings) > 10:
            console.print(" ...", style="grey37")

    if not strings_by_library:
        console.print(" (none)", style="grey37")
        # since we're not going to find any strings
        # return early and don't do IDA analysis
        return

    # TODO: ensure there are XXX matches for each library, or ignore those entries

    open_ida(input_path)

    import idaapi
    import idautils
    import ida_funcs

    import capa.features.extractors.ida.helpers as ida_helpers

    function_count = 0
    strings_by_function = collections.defaultdict(set)
    for ea in idautils.Functions():
        function_count += 1
        f = idaapi.get_func(ea)

        # ignore library functions and thunk functions as identified by IDA
        if f.flags & idaapi.FUNC_THUNK:
            continue
        if f.flags & idaapi.FUNC_LIB:
            continue

        for bb in ida_helpers.get_function_blocks(f):
            for insn in ida_helpers.get_instructions_in_range(bb.start_ea, bb.end_ea):
                ref = ida_helpers.find_data_reference_from_insn(insn)
                if ref == insn.ea:
                    # same guard as before: nothing to look up when the
                    # helper hands back the instruction's own address
                    continue

                string = ida_helpers.find_string_at(ref)
                if not string:
                    continue

                if any(string in db.metadata_by_string for db in dbs):
                    strings_by_function[ea].add(string)

    # ensure there are at least XXX functions renamed, or ignore those entries

    console.print("functions:", style="bold")
    for function, strings in sorted(strings_by_function.items()):
        if strings:
            name = ida_funcs.get_func_name(function)
            # build Text objects rather than markup strings so that brackets
            # in names or strings from the binary are never parsed as markup
            console.print(Text.assemble(" ", (name, "bold"), f"@{function:08x}:"))

            # sorted, so the report is stable from run to run
            for string in sorted(strings):
                for db in dbs:
                    if metadata := db.metadata_by_string.get(string):
                        location = Text(
                            f"{metadata.library_name}@{metadata.library_version}::{metadata.function_name}",
                            style="grey37",
                        )
                        console.print(" - ", location, ": ", Text(string.rstrip()))

    # TODO: ensure there aren't conflicts among the matches

    console.print()
    console.print(f"found {len(strings_by_function)} library functions across {function_count} functions")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,95 @@
import gzip
import pathlib
from typing import Dict, Sequence
from dataclasses import dataclass
import msgspec
class LibraryString(msgspec.Struct):
    # one database record: a string plus where it was seen.
    # the string itself; used as the lookup key by LibraryStringDatabase
    string: str
    # library the string was extracted from, and that library's version
    library_name: str
    library_version: str
    # optional provenance; each defaults to None when not recorded
    file_path: str | None = None
    function_name: str | None = None
    line_number: int | None = None
@dataclass
class LibraryStringDatabase:
    """collection of LibraryString records, indexed by the string itself."""

    metadata_by_string: Dict[str, LibraryString]

    def __len__(self) -> int:
        return len(self.metadata_by_string)

    @classmethod
    def from_file(cls, path: pathlib.Path) -> "LibraryStringDatabase":
        """load a database from a gzip-compressed JSONL file.

        empty lines are skipped.
        when a string occurs on more than one line, the last record wins.
        """
        decoder = msgspec.json.Decoder(type=LibraryString)
        raw_lines = gzip.decompress(path.read_bytes()).split(b"\n")
        records = (decoder.decode(raw) for raw in raw_lines if raw)
        return cls(metadata_by_string={record.string: record for record in records})
# file names of the bundled open source library string databases,
# resolved below relative to the "data/oss" directory next to this module.
DEFAULT_FILENAMES = (
    "brotli.jsonl.gz",
    "bzip2.jsonl.gz",
    "cryptopp.jsonl.gz",
    "curl.jsonl.gz",
    "detours.jsonl.gz",
    "jemalloc.jsonl.gz",
    "jsoncpp.jsonl.gz",
    "kcp.jsonl.gz",
    "liblzma.jsonl.gz",
    "libsodium.jsonl.gz",
    "libpcap.jsonl.gz",
    "mbedtls.jsonl.gz",
    "openssl.jsonl.gz",
    "sqlite3.jsonl.gz",
    "tomcrypt.jsonl.gz",
    "wolfssl.jsonl.gz",
    "zlib.jsonl.gz",
)
# full paths of all default databases:
# every entry of DEFAULT_FILENAMES under data/oss, followed by data/crt/msvc_v143.jsonl.gz.
DEFAULT_PATHS = tuple(
    pathlib.Path(__file__).parent / "data" / "oss" / filename for filename in DEFAULT_FILENAMES
) + (pathlib.Path(__file__).parent / "data" / "crt" / "msvc_v143.jsonl.gz",)
def get_default_databases() -> Sequence[LibraryStringDatabase]:
    """load one LibraryStringDatabase per entry of DEFAULT_PATHS, in order."""
    databases = []
    for database_path in DEFAULT_PATHS:
        databases.append(LibraryStringDatabase.from_file(database_path))
    return databases
@dataclass
class WindowsApiStringDatabase:
    """sets of DLL names and API names, loaded from gzip'd text files."""

    dll_names: set[str]
    api_names: set[str]

    def __len__(self) -> int:
        return len(self.dll_names) + len(self.api_names)

    @staticmethod
    def _read_names(path: pathlib.Path) -> set[str]:
        """return the non-empty lines of a gzip-compressed, UTF-8 text file."""
        text = gzip.decompress(path.read_bytes()).decode("utf-8")
        return {line for line in text.splitlines() if line}

    @classmethod
    def from_dir(cls, path: pathlib.Path) -> "WindowsApiStringDatabase":
        """load the database from `path`/dlls.txt.gz and `path`/apis.txt.gz.

        each file holds one name per line; empty lines are ignored.
        """
        return cls(
            dll_names=cls._read_names(path / "dlls.txt.gz"),
            api_names=cls._read_names(path / "apis.txt.gz"),
        )

    @classmethod
    def from_defaults(cls) -> "WindowsApiStringDatabase":
        """load the database from the data/winapi directory next to this module."""
        return cls.from_dir(pathlib.Path(__file__).parent / "data" / "winapi")

Binary file not shown.

View File

@@ -0,0 +1,3 @@
*.csv
*.jsonl
*.jsonl.gz

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,52 @@
"""
convert from a jh CSV file to a .jsonl.gz OpenSourceString database.
the jh file looks like:
# triplet,compiler,library,version,profile,path,function,type,value
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffff8
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0xfffffffe
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,BZ2_bzCompressInit
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,api,handle_compress
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0x0000fa90
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff8
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffff9
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffDecompress,number,0xfffffffd
jh is found here: https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs
"""
import sys
import json
import pathlib
import msgspec
from capa.analysis.strings import LibraryString
p = pathlib.Path(sys.argv[1])
# decode explicitly as UTF-8 rather than relying on the platform's default encoding
for line in p.read_text(encoding="utf-8").split("\n"):
    # skip blank lines and the "#"-prefixed header/comment lines
    if not line or line.startswith("#"):
        continue

    # split off only the first seven fields: the feature value itself may contain commas
    _triplet, _compiler, library, version, _profile, path, function, rest = line.split(",", 7)
    feature_type, _, value = rest.partition(",")
    # only string features are converted; other feature types are dropped
    if feature_type != "string":
        continue

    # values that start with a double quote are decoded as JSON string literals
    if value.startswith('"'):
        value = json.loads(value)

    s = LibraryString(
        string=value,
        library_name=library,
        library_version=version,
        file_path=path,
        function_name=function,
    )

    # emit one JSON document per line (JSONL) on stdout
    sys.stdout.buffer.write(msgspec.json.encode(s))
    sys.stdout.buffer.write(b"\n")

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -0,0 +1,99 @@
# Strings from Open Source libraries
This directory contains databases of strings extracted from open source software.
capa uses these databases to ignore functions that are likely library code.
There is one file for each database. Each database is a gzip-compressed, JSONL (one JSON document per line) file.
The JSON document looks like this:
string: "1.0.8, 13-Jul-2019"
library_name: "bzip2"
library_version: "1.0.8#3"
file_path: "CMakeFiles/bz2.dir/bzlib.c.obj"
function_name: "BZ2_bzlibVersion"
line_number: null
The following databases were extracted via the vcpkg & jh technique:
- brotli 1.0.9#5
- bzip2 1.0.8#3
- cryptopp 8.7.0
- curl 7.86.0#1
- detours 4.0.1#7
- jemalloc 5.3.0#1
- jsoncpp 1.9.5
- kcp 1.7
- liblzma 5.2.5#6
- libsodium 1.0.18#8
- libpcap 1.10.1#3
- mbedtls 2.28.1
- openssl 3.0.7#1
- sqlite3 3.40.0#1
- tomcrypt 1.18.2#2
- wolfssl 5.5.0
- zlib 1.2.13
This code was originally developed in FLOSS and imported into capa.
## The vcpkg & jh technique
Major steps:
1. build static libraries via vcpkg
2. extract features via jh
3. convert to JSONL format with `jh_to_qs.py`
4. compress with gzip
### Build static libraries via vcpkg
[vcpkg](https://vcpkg.io/en/) is a free C/C++ package manager for acquiring and managing libraries.
We use it to easily build common open source libraries, like zlib.
Use the triplet `x64-windows-static` to build static archives (.lib files that are AR archives containing COFF object files):
```console
PS > C:\vcpkg\vcpkg.exe install --triplet x64-windows-static zlib
```
### Extract features via jh
[jh](https://github.com/williballenthin/lancelot/blob/master/bin/src/bin/jh.rs)
is a lancelot-based utility that parses AR archives containing COFF object files,
reconstructs their control flow, finds functions, and extracts features.
jh extracts numbers, API calls, and strings; we are only interested in the string features.
For each feature, jh emits a CSV line with the fields
- target triplet
- compiler
- library
- version
- build profile
- path
- function
- feature type
- feature value
For example:
```csv
x64-windows-static,msvc143,bzip2,1.0.8#3,release,CMakeFiles/bz2.dir/bzlib.c.obj,BZ2_bzBuffToBuffCompress,number,0x00000100
```
For example, to invoke jh:
```console
$ ~/lancelot/target/release/jh x64-windows-static msvc143 zlib 1.2.13 release /mnt/c/vcpkg/installed/x64-windows-static/lib/zlib.lib > ~/flare-floss/floss/qs/db/data/oss/zlib.csv
```
### Convert to OSS database format
We use the script `jh_to_qs.py` to convert these CSV lines into a JSONL file prepared for FLOSS:
```console
$ python3 jh_to_qs.py zlib.csv > zlib.jsonl
```
These files are then gzip'd:
```console
$ gzip -c zlib.jsonl > zlib.jsonl.gz
```

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@@ -41,7 +41,7 @@ if hasattr(ida_bytes, "parse_binpat_str"):
return
while True:
ea, _ = ida_bytes.bin_search3(start, end, patterns, ida_bytes.BIN_SEARCH_FORWARD)
ea, _ = ida_bytes.bin_search(start, end, patterns, ida_bytes.BIN_SEARCH_FORWARD)
if ea == idaapi.BADADDR:
break
start = ea + 1

View File

@@ -18,7 +18,7 @@ logger = logging.getLogger(__name__)
def is_idalib_installed() -> bool:
try:
return importlib.util.find_spec("ida") is not None
return importlib.util.find_spec("idapro") is not None
except ModuleNotFoundError:
return False
@@ -44,6 +44,7 @@ def get_idalib_user_config_path() -> Optional[Path]:
def find_idalib() -> Optional[Path]:
config_path = get_idalib_user_config_path()
if not config_path:
logger.error("IDA Pro user configuration does not exist, please make sure you've installed idalib properly.")
return None
config = json.loads(config_path.read_text(encoding="utf-8"))
@@ -51,6 +52,9 @@ def find_idalib() -> Optional[Path]:
try:
ida_install_dir = Path(config["Paths"]["ida-install-dir"])
except KeyError:
logger.error(
"IDA Pro user configuration does not contain location of IDA Pro installation, please make sure you've installed idalib properly."
)
return None
if not ida_install_dir.exists():
@@ -73,7 +77,7 @@ def find_idalib() -> Optional[Path]:
if not idalib_path.exists():
return None
if not (idalib_path / "ida" / "__init__.py").is_file():
if not (idalib_path / "idapro" / "__init__.py").is_file():
return None
return idalib_path
@@ -96,7 +100,7 @@ def has_idalib() -> bool:
def load_idalib() -> bool:
try:
import ida
import idapro
return True
except ImportError:
@@ -106,7 +110,7 @@ def load_idalib() -> bool:
sys.path.append(idalib_path.absolute().as_posix())
try:
import ida # noqa: F401 unused import
import idapro # noqa: F401 unused import
return True
except ImportError:

View File

@@ -130,7 +130,13 @@ def extract_file_arch(pe, **kwargs):
elif pe.FILE_HEADER.Machine == pefile.MACHINE_TYPE["IMAGE_FILE_MACHINE_AMD64"]:
yield Arch(ARCH_AMD64), NO_ADDRESS
else:
logger.warning("unsupported architecture: %s", pefile.MACHINE_TYPE[pe.FILE_HEADER.Machine])
try:
logger.warning(
"unsupported architecture: %s",
pefile.MACHINE_TYPE[pe.FILE_HEADER.Machine],
)
except KeyError:
logger.warning("unknown architecture: %s", pe.FILE_HEADER.Machine)
def extract_file_features(pe, buf):

View File

@@ -323,7 +323,7 @@ def get_extractor(
if not idalib.load_idalib():
raise RuntimeError("failed to load IDA idalib module.")
import ida
import idapro
import ida_auto
import capa.features.extractors.ida.extractor
@@ -333,7 +333,7 @@ def get_extractor(
# so as not to screw up structured output.
with capa.helpers.stdout_redirector(io.BytesIO()):
with console.status("analyzing program...", spinner="dots"):
if ida.open_database(str(input_path), run_auto_analysis=True):
if idapro.open_database(str(input_path), run_auto_analysis=True):
raise RuntimeError("failed to analyze input file")
logger.debug("idalib: waiting for analysis...")

View File

@@ -177,7 +177,7 @@ known_first_party = [
"binaryninja",
"flirt",
"ghidra",
"ida",
"idapro",
"ida_ida",
"ida_auto",
"ida_bytes",

View File

@@ -36,7 +36,7 @@ pyelftools==0.31
pygments==2.18.0
python-flirt==0.8.10
pyyaml==6.0.2
rich==13.8.0
rich==13.9.2
ruamel-yaml==0.18.6
ruamel-yaml-clib==0.2.8
setuptools==75.1.0