cape: use pydantic model

This commit is contained in:
Willi Ballenthin
2023-08-16 11:12:05 +00:00
committed by GitHub
parent e943a71dff
commit 6f7bf96776
6 changed files with 141 additions and 170 deletions

View File

@@ -7,29 +7,24 @@
# See the License for the specific language governing permissions and limitations under the License. # See the License for the specific language governing permissions and limitations under the License.
import logging import logging
from typing import Any, Dict, List, Tuple, Iterator from typing import Tuple, Iterator
import capa.features.extractors.cape.file from capa.helpers import assert_never
import capa.features.extractors.cape.thread
import capa.features.extractors.cape.global_
import capa.features.extractors.cape.process
from capa.features.insn import API, Number from capa.features.insn import API, Number
from capa.features.common import String, Feature from capa.features.common import String, Feature
from capa.features.address import Address from capa.features.address import Address
from capa.features.extractors.cape.models import Call
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def extract_call_features( def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
behavior: Dict, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
) -> Iterator[Tuple[Feature, Address]]:
""" """
this method extracts the given call's features (api name and arguments), this method extracts the given call's features (such as API name and arguments),
and returns them as API, Number, and String features. and returns them as API, Number, and String features.
args: args:
behavior: a dictionary of behavioral artifacts extracted by the sandbox
ph: process handle (for defining the extraction scope) ph: process handle (for defining the extraction scope)
th: thread handle (for defining the extraction scope) th: thread handle (for defining the extraction scope)
ch: call handle (for defining the extraction scope) ch: call handle (for defining the extraction scope)
@@ -37,27 +32,29 @@ def extract_call_features(
yields: yields:
Feature, address; where Feature is either: API, Number, or String. Feature, address; where Feature is either: API, Number, or String.
""" """
# TODO(yelhamer): find correct base address used at runtime. call: Call = ch.inner
# this address may vary from the PE header, may read actual base from procdump.pe.imagebase or similar.
# https://github.com/mandiant/capa/issues/1618
process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph)
calls: List[Dict[str, Any]] = process["calls"]
call = calls[ch.address.id]
assert call["thread_id"] == str(th.address.tid)
# list similar to disassembly: arguments right-to-left, call # list similar to disassembly: arguments right-to-left, call
for arg in call["arguments"][::-1]: for arg in reversed(call.arguments):
try: if isinstance(arg, list) and len(arg) == 0:
yield Number(int(arg["value"], 16)), ch.address # unsure why CAPE captures arguments as empty lists?
except ValueError: continue
yield String(arg["value"]), ch.address
yield API(call["api"]), ch.address elif isinstance(arg, str):
yield String(arg), ch.address
elif isinstance(arg, int):
yield Number(arg), ch.address
else:
assert_never(arg)
yield API(call.api), ch.address
def extract_features( def extract_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> Iterator[Tuple[Feature, Address]]:
behavior: Dict, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
) -> Iterator[Tuple[Feature, Address]]:
for handler in CALL_HANDLERS: for handler in CALL_HANDLERS:
for feature, addr in handler(behavior, ph, th, ch): for feature, addr in handler(ph, th, ch):
yield feature, addr yield feature, addr

View File

@@ -14,8 +14,10 @@ import capa.features.extractors.cape.file
import capa.features.extractors.cape.thread import capa.features.extractors.cape.thread
import capa.features.extractors.cape.global_ import capa.features.extractors.cape.global_
import capa.features.extractors.cape.process import capa.features.extractors.cape.process
from capa.features.common import Feature from capa.exceptions import UnsupportedFormatError
from capa.features.address import Address, AbsoluteVirtualAddress, _NoAddress from capa.features.common import Feature, Characteristic
from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, _NoAddress
from capa.features.extractors.cape.models import CapeReport
from capa.features.extractors.base_extractor import ( from capa.features.extractors.base_extractor import (
CallHandle, CallHandle,
SampleHashes, SampleHashes,
@@ -26,26 +28,26 @@ from capa.features.extractors.base_extractor import (
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
TESTED_VERSIONS = ("2.2-CAPE",) TESTED_VERSIONS = {"2.2-CAPE", "2.4-CAPE"}
class CapeExtractor(DynamicFeatureExtractor): class CapeExtractor(DynamicFeatureExtractor):
def __init__(self, cape_version: str, static: Dict, behavior: Dict): def __init__(self, report: CapeReport):
super().__init__() super().__init__()
self.cape_version = cape_version self.report: CapeReport = report
self.static = static
self.behavior = behavior
self.sample_hashes = SampleHashes( self.sample_hashes = SampleHashes(
md5=static["file"]["md5"].lower(), md5=self.report.target.file.md5.lower(),
sha1=static["file"]["sha1"].lower(), sha1=self.report.target.file.sha1.lower(),
sha256=static["file"]["sha256"].lower(), sha256=self.report.target.file.sha256.lower(),
) )
self.global_features = capa.features.extractors.cape.global_.extract_features(self.static) self.global_features = capa.features.extractors.cape.global_.extract_features(self.report)
def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]: def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]:
# value according to the PE header, the actual trace may use a different imagebase # value according to the PE header, the actual trace may use a different imagebase
return AbsoluteVirtualAddress(self.static["pe"]["imagebase"]) assert self.report.static is not None and self.report.static.pe is not None
return AbsoluteVirtualAddress(self.report.static.pe.imagebase)
def get_sample_hashes(self) -> SampleHashes: def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes return self.sample_hashes
@@ -54,44 +56,43 @@ class CapeExtractor(DynamicFeatureExtractor):
yield from self.global_features yield from self.global_features
def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.cape.file.extract_features(self.static) yield from capa.features.extractors.cape.file.extract_features(self.report)
def get_processes(self) -> Iterator[ProcessHandle]: def get_processes(self) -> Iterator[ProcessHandle]:
yield from capa.features.extractors.cape.file.get_processes(self.behavior) yield from capa.features.extractors.cape.file.get_processes(self.report)
def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: def extract_process_features(self, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.cape.process.extract_features(self.behavior, ph) yield from capa.features.extractors.cape.process.extract_features(ph)
def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
yield from capa.features.extractors.cape.process.get_threads(self.behavior, ph) yield from capa.features.extractors.cape.process.get_threads(ph)
def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.cape.thread.extract_features(self.behavior, ph, th) if False:
# force this routine to be a generator,
# but we don't actually have any elements to generate.
yield Characteristic("never"), NO_ADDRESS
return
def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]: def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
yield from capa.features.extractors.cape.thread.get_calls(self.behavior, ph, th) yield from capa.features.extractors.cape.thread.get_calls(ph, th)
def extract_call_features( def extract_call_features(
self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
) -> Iterator[Tuple[Feature, Address]]: ) -> Iterator[Tuple[Feature, Address]]:
yield from capa.features.extractors.cape.call.extract_features(self.behavior, ph, th, ch) yield from capa.features.extractors.cape.call.extract_features(ph, th, ch)
@classmethod @classmethod
def from_report(cls, report: Dict) -> "CapeExtractor": def from_report(cls, report: Dict) -> "CapeExtractor":
cape_version = report["info"]["version"] cr = CapeReport.model_validate(report)
if cape_version not in TESTED_VERSIONS:
logger.warning("CAPE version '%s' not tested/supported yet", cape_version)
static = report["static"] if cr.info.version not in TESTED_VERSIONS:
format_ = list(static.keys())[0] logger.warning("CAPE version '%s' not tested/supported yet", cr.info.version)
static = static[format_]
static.update(report["behavior"].pop("summary"))
static.update(report["target"])
static.update({"processtree": report["behavior"]["processtree"]})
static.update({"strings": report["strings"]})
static.update({"format": format_})
behavior = report.pop("behavior") if cr.static is None:
behavior["network"] = report.pop("network") raise UnsupportedFormatError("CAPE report missing static analysis")
return cls(cape_version, static, behavior) if cr.static.pe is None:
raise UnsupportedFormatError("CAPE report missing static analysis")
return cls(cr)

View File

@@ -7,106 +7,98 @@
# See the License for the specific language governing permissions and limitations under the License. # See the License for the specific language governing permissions and limitations under the License.
import logging import logging
from typing import Dict, Tuple, Iterator from typing import Tuple, Iterator
from capa.features.file import Export, Import, Section from capa.features.file import Export, Import, Section
from capa.features.common import String, Feature from capa.features.common import String, Feature
from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress from capa.features.address import NO_ADDRESS, Address, ProcessAddress, AbsoluteVirtualAddress
from capa.features.extractors.helpers import generate_symbols from capa.features.extractors.helpers import generate_symbols
from capa.features.extractors.cape.models import CapeReport
from capa.features.extractors.base_extractor import ProcessHandle from capa.features.extractors.base_extractor import ProcessHandle
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_processes(static: Dict) -> Iterator[ProcessHandle]: def get_processes(report: CapeReport) -> Iterator[ProcessHandle]:
""" """
get all the created processes for a sample get all the created processes for a sample
""" """
for process in report.behavior.processes:
def rec(process): addr = ProcessAddress(pid=process.process_id, ppid=process.parent_id)
address: ProcessAddress = ProcessAddress(pid=process["pid"], ppid=process["parent_id"]) yield ProcessHandle(address=addr, inner=process)
inner: Dict[str, str] = {"name": process["name"]}
yield ProcessHandle(address=address, inner=inner)
for child in process["children"]:
yield from rec(child)
for process in static["processtree"]:
yield from rec(process)
def extract_import_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_import_names(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
""" """
extract imported function names extract imported function names
""" """
imports = static["imports"] assert report.static is not None and report.static.pe is not None
imports = report.static.pe.imports
"""
2.2-CAPE
"imports": [
{
"dll": "RPCRT4.dll",
"imports": [{"address": "0x40504c","name": "NdrSimpleTypeUnmarshall"}, ...]
},
...
]
2.4-CAPE
"imports": {
"ADVAPI32": {
"dll": "ADVAPI32.dll",
"imports": [{"address": "0x522000", "name": "OpenSCManagerA"}, ...],
...
},
...
}
"""
if isinstance(imports, dict): if isinstance(imports, dict):
imports = imports.values() imports = list(imports.values())
assert isinstance(imports, list)
for library in imports: for library in imports:
for function in library["imports"]: for function in library.imports:
addr = int(function["address"], 16) for name in generate_symbols(library.dll, function.name):
for name in generate_symbols(library["dll"], function["name"]): yield Import(name), AbsoluteVirtualAddress(function.address)
yield Import(name), AbsoluteVirtualAddress(addr)
def extract_export_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_export_names(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for function in static["exports"]: assert report.static is not None and report.static.pe is not None
name, address = function["name"], int(function["address"], 16) for function in report.static.pe.exports:
yield Export(name), AbsoluteVirtualAddress(address) yield Export(function.name), AbsoluteVirtualAddress(function.address)
def extract_section_names(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_section_names(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
# be consistent with static extractors and use section VA assert report.static is not None and report.static.pe is not None
base = int(static["imagebase"], 16) for section in report.static.pe.sections:
for section in static["sections"]: yield Section(section.name), AbsoluteVirtualAddress(section.virtual_address)
name, address = section["name"], int(section["virtual_address"], 16)
yield Section(name), AbsoluteVirtualAddress(base + address)
def extract_file_strings(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_file_strings(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for string_ in static["strings"]: if report.strings is not None:
yield String(string_), NO_ADDRESS for string in report.strings:
yield String(string), NO_ADDRESS
def extract_used_regkeys(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_used_regkeys(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for regkey in static["keys"]: for regkey in report.behavior.summary.keys:
yield String(regkey), NO_ADDRESS yield String(regkey), NO_ADDRESS
def extract_used_files(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_used_files(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for filename in static["files"]: for file in report.behavior.summary.files:
yield String(filename), NO_ADDRESS yield String(file), NO_ADDRESS
def extract_used_mutexes(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_used_mutexes(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for mutex in static["mutexes"]: for mutex in report.behavior.summary.mutexes:
yield String(mutex), NO_ADDRESS yield String(mutex), NO_ADDRESS
def extract_features(static: Dict) -> Iterator[Tuple[Feature, Address]]: def extract_used_commands(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for cmd in report.behavior.summary.executed_commands:
yield String(cmd), NO_ADDRESS
def extract_used_apis(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for symbol in report.behavior.summary.resolved_apis:
yield String(symbol), NO_ADDRESS
def extract_used_services(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for svc in report.behavior.summary.created_services:
yield String(svc), NO_ADDRESS
for svc in report.behavior.summary.started_services:
yield String(svc), NO_ADDRESS
def extract_features(report: CapeReport) -> Iterator[Tuple[Feature, Address]]:
for handler in FILE_HANDLERS: for handler in FILE_HANDLERS:
for feature, addr in handler(static): for feature, addr in handler(report):
yield feature, addr yield feature, addr
@@ -118,4 +110,6 @@ FILE_HANDLERS = (
extract_used_regkeys, extract_used_regkeys,
extract_used_files, extract_used_files,
extract_used_mutexes, extract_used_mutexes,
extract_used_apis,
extract_used_services,
) )

View File

@@ -63,6 +63,10 @@ EmptyDict: TypeAlias = BaseModel
EmptyList: TypeAlias = List[Any] EmptyList: TypeAlias = List[Any]
class Info(FlexibleModel):
version: str
class ImportedSymbol(ExactModel): class ImportedSymbol(ExactModel):
address: HexInt address: HexInt
name: str name: str
@@ -251,7 +255,7 @@ class ProcessFile(File):
class Argument(ExactModel): class Argument(ExactModel):
name: str name: str
# unsure why empty list is provided here # unsure why empty list is provided here
value: Union[HexInt, str, EmptyList] value: Union[HexInt, int, str, EmptyList]
pretty_value: Optional[str] = None pretty_value: Optional[str] = None
@@ -359,6 +363,8 @@ class CAPE(ExactModel):
class CapeReport(FlexibleModel): class CapeReport(FlexibleModel):
# the input file, I think # the input file, I think
target: Target target: Target
# info about the processing job, like machine and distributed metadata.
info: Info
# #
# static analysis results # static analysis results
@@ -397,8 +403,6 @@ class CapeReport(FlexibleModel):
# screenshot hash values # screenshot hash values
deduplicated_shots: Skip = None deduplicated_shots: Skip = None
# info about the processing job, like machine and distributed metadata.
info: Skip = None
# k-v pairs describing the time it took to run each stage. # k-v pairs describing the time it took to run each stage.
statistics: Skip = None statistics: Skip = None
# k-v pairs of ATT&CK ID to signature name or similar. # k-v pairs of ATT&CK ID to signature name or similar.

View File

@@ -7,50 +7,41 @@
# See the License for the specific language governing permissions and limitations under the License. # See the License for the specific language governing permissions and limitations under the License.
import logging import logging
from typing import Dict, List, Tuple, Iterator from typing import List, Tuple, Iterator
import capa.features.extractors.cape.file
import capa.features.extractors.cape.thread
import capa.features.extractors.cape.global_
import capa.features.extractors.cape.process
from capa.features.common import String, Feature from capa.features.common import String, Feature
from capa.features.address import Address, ThreadAddress from capa.features.address import Address, ThreadAddress
from capa.features.extractors.cape.models import Process
from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle from capa.features.extractors.base_extractor import ThreadHandle, ProcessHandle
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_threads(behavior: Dict, ph: ProcessHandle) -> Iterator[ThreadHandle]: def get_threads(ph: ProcessHandle) -> Iterator[ThreadHandle]:
""" """
get the threads associated with a given process get the threads associated with a given process
""" """
process: Process = ph.inner
process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) threads: List[int] = process.threads
threads: List = process["threads"]
for thread in threads: for thread in threads:
address: ThreadAddress = ThreadAddress(process=ph.address, tid=int(thread)) address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread)
yield ThreadHandle(address=address, inner={}) yield ThreadHandle(address=address, inner={})
def extract_environ_strings(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: def extract_environ_strings(ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
""" """
extract strings from a process' provided environment variables. extract strings from a process' provided environment variables.
""" """
process: Process = ph.inner
process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) for value in (value for value in process.environ.values() if value):
environ: Dict[str, str] = process["environ"]
if not environ:
return
for value in (value for value in environ.values() if value):
yield String(value), ph.address yield String(value), ph.address
def extract_features(behavior: Dict, ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]: def extract_features(ph: ProcessHandle) -> Iterator[Tuple[Feature, Address]]:
for handler in PROCESS_HANDLERS: for handler in PROCESS_HANDLERS:
for feature, addr in handler(behavior, ph): for feature, addr in handler(ph):
yield feature, addr yield feature, addr

View File

@@ -7,38 +7,22 @@
# See the License for the specific language governing permissions and limitations under the License. # See the License for the specific language governing permissions and limitations under the License.
import logging import logging
from typing import Any, Dict, List, Tuple, Iterator from typing import Iterator
import capa.features.extractors.cape.helpers from capa.features.address import DynamicCallAddress
from capa.features.common import Feature from capa.features.extractors.cape.models import Process
from capa.features.address import NO_ADDRESS, Address, DynamicCallAddress
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def get_calls(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]: def get_calls(ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
process = capa.features.extractors.cape.helpers.find_process(behavior["processes"], ph) process: Process = ph.inner
calls: List[Dict[str, Any]] = process["calls"]
tid = str(th.address.tid) tid = th.address.tid
for call in calls: for call_index, call in enumerate(process.calls):
if call["thread_id"] != tid: if call.thread_id != tid:
continue continue
addr = DynamicCallAddress(thread=th.address, id=call["id"]) addr = DynamicCallAddress(thread=th.address, id=call_index)
ch = CallHandle(address=addr, inner={}) yield CallHandle(address=addr, inner=call)
yield ch
def extract_thread_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
yield from ((Feature(0), NO_ADDRESS),)
def extract_features(behavior: Dict, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]:
for handler in THREAD_HANDLERS:
for feature, addr in handler(behavior, ph, th):
yield feature, addr
THREAD_HANDLERS = (extract_thread_features,)