Merge branch 'dynamic-feature-extraction' into fix/dynamic-proto

This commit is contained in:
Willi Ballenthin
2023-09-05 08:18:51 +00:00
14 changed files with 86 additions and 86 deletions

1
.gitmodules vendored
View File

@@ -1,6 +1,7 @@
[submodule "rules"]
path = rules
url = ../capa-rules.git
branch = dynamic-syntax
[submodule "tests/data"]
path = tests/data
url = ../capa-testfiles.git

View File

@@ -304,7 +304,7 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -
other strategies can be imagined that match differently; implement these elsewhere.
specifically, this routine does "top down" matching of the given rules against the feature set.
"""
results = collections.defaultdict(list) # type: MatchResults
results: MatchResults = collections.defaultdict(list)
# copy features so that we can modify it
# without affecting the caller (keep this function pure)

View File

@@ -412,8 +412,6 @@ class DynamicFeatureExtractor:
"""
Yields all the features of a process. These include:
- file features of the process' image
- inter-process injection
- detected dynamic DLL loading
"""
raise NotImplementedError()
@@ -429,8 +427,6 @@ class DynamicFeatureExtractor:
"""
Yields all the features of a thread. These include:
- sequenced api traces
- file/registry interactions
- network activity
"""
raise NotImplementedError()

View File

@@ -6,7 +6,6 @@
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
from typing import List, Tuple, Iterator
from pathlib import Path
import binaryninja as binja
@@ -29,7 +28,7 @@ from capa.features.extractors.base_extractor import (
class BinjaFeatureExtractor(StaticFeatureExtractor):
def __init__(self, bv: binja.BinaryView):
super().__init__(hashes=SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()))
super().__init__(hashes=SampleHashes.from_bytes(bv.file.raw.read(0, len(bv.file.raw))))
self.bv = bv
self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv))

View File

@@ -55,8 +55,8 @@ def generate_symbols(dll: str, symbol: str) -> Iterator[str]:
dll = dll.lower()
# trim extensions observed in dynamic traces
dll = dll.replace(".dll", "")
dll = dll.replace(".drv", "")
dll = dll[0:-4] if dll.endswith(".dll") else dll
dll = dll[0:-4] if dll.endswith(".drv") else dll
# kernel32.CreateFileA
yield f"{dll}.{symbol}"

View File

@@ -59,6 +59,9 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor):
def get_base_address(self):
return self.base_address
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
for feature in self.global_features:
yield feature, NO_ADDRESS
@@ -121,6 +124,9 @@ class NullDynamicFeatureExtractor(DynamicFeatureExtractor):
for feature in self.global_features:
yield feature, NO_ADDRESS
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_file_features(self):
for address, feature in self.file_features:
yield feature, address

View File

@@ -53,7 +53,6 @@ class AddressType(str, Enum):
PROCESS = "process"
THREAD = "thread"
CALL = "call"
DYNAMIC = "dynamic"
NO_ADDRESS = "no address"
@@ -424,7 +423,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
# Mypy is unable to recognise `global_` as an argument due to alias
freeze = Freeze(
version=2,
version=3,
base_address=Address.from_capa(extractor.get_base_address()),
sample_hashes=extractor.get_sample_hashes(),
extractor=Extractor(name=extractor.__class__.__name__),
@@ -528,7 +527,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
base_addr = get_base_addr() if get_base_addr else capa.features.address.NO_ADDRESS
freeze = Freeze(
version=2,
version=3,
base_address=Address.from_capa(base_addr),
sample_hashes=extractor.get_sample_hashes(),
extractor=Extractor(name=extractor.__class__.__name__),
@@ -542,7 +541,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
def loads_static(s: str) -> StaticFeatureExtractor:
"""deserialize a set of features (as a NullStaticFeatureExtractor) from a string."""
freeze = Freeze.model_validate_json(s)
if freeze.version != 2:
if freeze.version != 3:
raise ValueError(f"unsupported freeze format version: {freeze.version}")
assert isinstance(freeze.features, StaticFeatures)
@@ -575,7 +574,7 @@ def loads_static(s: str) -> StaticFeatureExtractor:
def loads_dynamic(s: str) -> DynamicFeatureExtractor:
"""deserialize a set of features (as a NullDynamicFeatureExtractor) from a string."""
freeze = Freeze.parse_raw(s)
if freeze.version != 2:
if freeze.version != 3:
raise ValueError(f"unsupported freeze format version: {freeze.version}")
assert isinstance(freeze.features, DynamicFeatures)
@@ -624,11 +623,11 @@ def is_freeze(buf: bytes) -> bool:
return buf[: len(MAGIC)] == MAGIC
def is_static(buf: bytes) -> bool:
def is_static_freeze(buf: bytes) -> bool:
return buf[: len(STATIC_MAGIC)] == STATIC_MAGIC
def is_dynamic(buf: bytes) -> bool:
def is_dynamic_freeze(buf: bytes) -> bool:
return buf[: len(DYNAMIC_MAGIC)] == DYNAMIC_MAGIC
@@ -636,9 +635,9 @@ def load(buf: bytes):
"""deserialize a set of features (as a NullFeatureExtractor) from a byte array."""
if not is_freeze(buf):
raise ValueError("missing magic header")
if is_static(buf):
if is_static_freeze(buf):
return loads_static(zlib.decompress(buf[len(STATIC_MAGIC) :]).decode("utf-8"))
elif is_dynamic(buf):
elif is_dynamic_freeze(buf):
return loads_dynamic(zlib.decompress(buf[len(DYNAMIC_MAGIC) :]).decode("utf-8"))
else:
raise ValueError("invalid magic header")

View File

@@ -145,7 +145,7 @@ def find_instruction_capabilities(
returns: tuple containing (features for instruction, match results for instruction)
"""
# all features found for the instruction.
features = collections.defaultdict(set) # type: FeatureSet
features: FeatureSet = collections.defaultdict(set)
for feature, addr in itertools.chain(
extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
@@ -173,11 +173,11 @@ def find_basic_block_capabilities(
"""
# all features found within this basic block,
# includes features found within instructions.
features = collections.defaultdict(set) # type: FeatureSet
features: FeatureSet = collections.defaultdict(set)
# matches found at the instruction scope.
# might be found at different instructions, that's ok.
insn_matches = collections.defaultdict(list) # type: MatchResults
insn_matches: MatchResults = collections.defaultdict(list)
for insn in extractor.get_instructions(f, bb):
ifeatures, imatches = find_instruction_capabilities(ruleset, extractor, f, bb, insn)
@@ -213,15 +213,15 @@ def find_code_capabilities(
"""
# all features found within this function,
# includes features found within basic blocks (and instructions).
function_features = collections.defaultdict(set) # type: FeatureSet
function_features: FeatureSet = collections.defaultdict(set)
# matches found at the basic block scope.
# might be found at different basic blocks, that's ok.
bb_matches = collections.defaultdict(list) # type: MatchResults
bb_matches: MatchResults = collections.defaultdict(list)
# matches found at the instruction scope.
# might be found at different instructions, that's ok.
insn_matches = collections.defaultdict(list) # type: MatchResults
insn_matches: MatchResults = collections.defaultdict(list)
for bb in extractor.get_basic_blocks(fh):
features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, fh, bb)
@@ -242,7 +242,7 @@ def find_code_capabilities(
def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
file_features = collections.defaultdict(set) # type: FeatureSet
file_features: FeatureSet = collections.defaultdict(set)
for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
# not all file features may have virtual addresses.
@@ -265,9 +265,9 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi
def find_static_capabilities(
ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None
) -> Tuple[MatchResults, Any]:
all_function_matches = collections.defaultdict(list) # type: MatchResults
all_bb_matches = collections.defaultdict(list) # type: MatchResults
all_insn_matches = collections.defaultdict(list) # type: MatchResults
all_function_matches: MatchResults = collections.defaultdict(list)
all_bb_matches: MatchResults = collections.defaultdict(list)
all_insn_matches: MatchResults = collections.defaultdict(list)
feature_counts = rdoc.StaticFeatureCounts(file=0, functions=())
library_functions: Tuple[rdoc.LibraryFunction, ...] = ()
@@ -328,7 +328,7 @@ def find_static_capabilities(
# collection of features that captures the rule matches within function, BB, and instruction scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
function_and_lower_features = collections.defaultdict(set) # type: FeatureSet
function_and_lower_features: FeatureSet = collections.defaultdict(set)
for rule_name, results in itertools.chain(
all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items()
):
@@ -368,7 +368,7 @@ def find_call_capabilities(
returns: tuple containing (features for call, match results for call)
"""
# all features found for the call.
features = collections.defaultdict(set) # type: FeatureSet
features: FeatureSet = collections.defaultdict(set)
for feature, addr in itertools.chain(
extractor.extract_call_features(ph, th, ch), extractor.extract_global_features()
@@ -396,11 +396,11 @@ def find_thread_capabilities(
"""
# all features found within this thread,
# includes features found within calls.
features = collections.defaultdict(set) # type: FeatureSet
features: FeatureSet = collections.defaultdict(set)
# matches found at the call scope.
# might be found at different calls, that's ok.
call_matches = collections.defaultdict(list) # type: MatchResults
call_matches: MatchResults = collections.defaultdict(list)
for ch in extractor.get_calls(ph, th):
ifeatures, imatches = find_call_capabilities(ruleset, extractor, ph, th, ch)
@@ -434,15 +434,15 @@ def find_process_capabilities(
"""
# all features found within this process,
# includes features found within threads (and calls).
process_features = collections.defaultdict(set) # type: FeatureSet
process_features: FeatureSet = collections.defaultdict(set)
# matches found at the thread scope.
# might be found at different threads, that's ok.
thread_matches = collections.defaultdict(list) # type: MatchResults
thread_matches: MatchResults = collections.defaultdict(list)
# matches found at the call scope.
# might be found at different calls, that's ok.
call_matches = collections.defaultdict(list) # type: MatchResults
call_matches: MatchResults = collections.defaultdict(list)
for th in extractor.get_threads(ph):
features, tmatches, cmatches = find_thread_capabilities(ruleset, extractor, ph, th)
@@ -465,9 +465,9 @@ def find_process_capabilities(
def find_dynamic_capabilities(
ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None
) -> Tuple[MatchResults, Any]:
all_process_matches = collections.defaultdict(list) # type: MatchResults
all_thread_matches = collections.defaultdict(list) # type: MatchResults
all_call_matches = collections.defaultdict(list) # type: MatchResults
all_process_matches: MatchResults = collections.defaultdict(list)
all_thread_matches: MatchResults = collections.defaultdict(list)
all_call_matches: MatchResults = collections.defaultdict(list)
feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())
@@ -502,7 +502,7 @@ def find_dynamic_capabilities(
# collection of features that captures the rule matches within process and thread scopes.
# mapping from feature (matched rule) to set of addresses at which it matched.
process_and_lower_features = collections.defaultdict(set) # type: FeatureSet
process_and_lower_features: FeatureSet = collections.defaultdict(set)
for rule_name, results in itertools.chain(
all_process_matches.items(), all_thread_matches.items(), all_call_matches.items()
):
@@ -902,7 +902,7 @@ def get_rules(
if ruleset is not None:
return ruleset
rules = [] # type: List[Rule]
rules: List[Rule] = []
total_rule_count = len(rule_file_paths)
for i, (path, content) in enumerate(zip(rule_file_paths, rule_contents)):
@@ -1021,7 +1021,7 @@ def collect_metadata(
md5=md5,
sha1=sha1,
sha256=sha256,
path=str(Path(sample_path).resolve()),
path=Path(sample_path).resolve().as_posix(),
),
flavor=flavor,
analysis=get_sample_analysis(
@@ -1087,7 +1087,6 @@ def compute_static_layout(rules, extractor: StaticFeatureExtractor, capabilities
otherwise, we may pollute the json document with
a large amount of un-referenced data.
"""
assert isinstance(extractor, StaticFeatureExtractor)
functions_by_bb: Dict[Address, Address] = {}
bbs_by_function: Dict[Address, List[Address]] = {}
for f in extractor.get_functions():

View File

@@ -54,10 +54,6 @@ def format_address(address: frz.Address) -> str:
assert isinstance(token, int)
assert isinstance(offset, int)
return f"token({capa.helpers.hex(token)})+{capa.helpers.hex(offset)}"
elif address.type == frz.AddressType.DYNAMIC:
assert isinstance(address.value, tuple)
ppid, pid, tid, id_, return_address = address.value
return f"process ppid: {ppid}, process pid: {pid}, thread id: {tid}, call: {id_}, return address: {capa.helpers.hex(return_address)}"
elif address.type == frz.AddressType.PROCESS:
assert isinstance(address.value, tuple)
ppid, pid = address.value
@@ -79,7 +75,7 @@ def format_address(address: frz.Address) -> str:
raise ValueError("unexpected address type")
def render_static_meta(ostream, doc: rd.ResultDocument):
def render_static_meta(ostream, meta: rd.Metadata):
"""
like:
@@ -100,33 +96,33 @@ def render_static_meta(ostream, doc: rd.ResultDocument):
total feature count 1918
"""
assert isinstance(doc.meta.analysis, rd.StaticAnalysis)
assert isinstance(meta.analysis, rd.StaticAnalysis)
rows = [
("md5", doc.meta.sample.md5),
("sha1", doc.meta.sample.sha1),
("sha256", doc.meta.sample.sha256),
("path", doc.meta.sample.path),
("timestamp", doc.meta.timestamp),
("capa version", doc.meta.version),
("os", doc.meta.analysis.os),
("format", doc.meta.analysis.format),
("arch", doc.meta.analysis.arch),
("analysis", doc.meta.flavor),
("extractor", doc.meta.analysis.extractor),
("base address", format_address(doc.meta.analysis.base_address)),
("rules", "\n".join(doc.meta.analysis.rules)),
("function count", len(doc.meta.analysis.feature_counts.functions)),
("library function count", len(doc.meta.analysis.library_functions)),
("md5", meta.sample.md5),
("sha1", meta.sample.sha1),
("sha256", meta.sample.sha256),
("path", meta.sample.path),
("timestamp", meta.timestamp),
("capa version", meta.version),
("os", meta.analysis.os),
("format", meta.analysis.format),
("arch", meta.analysis.arch),
("analysis", meta.flavor),
("extractor", meta.analysis.extractor),
("base address", format_address(meta.analysis.base_address)),
("rules", "\n".join(meta.analysis.rules)),
("function count", len(meta.analysis.feature_counts.functions)),
("library function count", len(meta.analysis.library_functions)),
(
"total feature count",
doc.meta.analysis.feature_counts.file + sum(f.count for f in doc.meta.analysis.feature_counts.functions),
meta.analysis.feature_counts.file + sum(f.count for f in meta.analysis.feature_counts.functions),
),
]
ostream.writeln(tabulate.tabulate(rows, tablefmt="plain"))
def render_dynamic_meta(ostream, doc: rd.ResultDocument):
def render_dynamic_meta(ostream, meta: rd.Metadata):
"""
like:
@@ -145,24 +141,24 @@ def render_dynamic_meta(ostream, doc: rd.ResultDocument):
total feature count 1918
"""
assert isinstance(doc.meta.analysis, rd.DynamicAnalysis)
assert isinstance(meta.analysis, rd.DynamicAnalysis)
rows = [
("md5", doc.meta.sample.md5),
("sha1", doc.meta.sample.sha1),
("sha256", doc.meta.sample.sha256),
("path", doc.meta.sample.path),
("timestamp", doc.meta.timestamp),
("capa version", doc.meta.version),
("os", doc.meta.analysis.os),
("format", doc.meta.analysis.format),
("arch", doc.meta.analysis.arch),
("analysis", doc.meta.flavor),
("extractor", doc.meta.analysis.extractor),
("rules", "\n".join(doc.meta.analysis.rules)),
("process count", len(doc.meta.analysis.feature_counts.processes)),
("md5", meta.sample.md5),
("sha1", meta.sample.sha1),
("sha256", meta.sample.sha256),
("path", meta.sample.path),
("timestamp", meta.timestamp),
("capa version", meta.version),
("os", meta.analysis.os),
("format", meta.analysis.format),
("arch", meta.analysis.arch),
("analysis", meta.flavor),
("extractor", meta.analysis.extractor),
("rules", "\n".join(meta.analysis.rules)),
("process count", len(meta.analysis.feature_counts.processes)),
(
"total feature count",
doc.meta.analysis.feature_counts.file + sum(p.count for p in doc.meta.analysis.feature_counts.processes),
meta.analysis.feature_counts.file + sum(p.count for p in meta.analysis.feature_counts.processes),
),
]
@@ -171,9 +167,9 @@ def render_dynamic_meta(ostream, doc: rd.ResultDocument):
def render_meta(osstream, doc: rd.ResultDocument):
if isinstance(doc.meta.analysis, rd.StaticAnalysis):
render_static_meta(osstream, doc)
render_static_meta(osstream, doc.meta)
elif isinstance(doc.meta.analysis, rd.DynamicAnalysis):
render_dynamic_meta(osstream, doc)
render_dynamic_meta(osstream, doc.meta)
else:
raise ValueError("invalid meta analysis")

View File

@@ -86,6 +86,10 @@ class Scope(str, Enum):
# not used to validate rules.
GLOBAL = "global"
@classmethod
def to_yaml(cls, representer, node):
return representer.represent_str(f"{node.value}")
# these literals are used to check if the flavor
# of a rule is correct.
@@ -979,6 +983,7 @@ class Rule:
# we use the ruamel.yaml parser because it supports roundtripping of documents with comments.
y = ruamel.yaml.YAML(typ="rt")
y.register_class(Scope)
# use block mode, not inline json-like mode
y.default_flow_style = False

2
rules

Submodule rules updated: 61bc8c7790...d923cf4b8f

View File

@@ -90,7 +90,7 @@ def main():
continue
if rule.meta.is_subscope_rule:
continue
if capa.rules.Scope.FUNCTION in rule.meta.scopes:
if rule.meta.scopes.static == capa.rules.Scope.FUNCTION:
continue
ns = rule.meta.namespace

View File

@@ -263,7 +263,6 @@ def assert_round_trip(rd: rdoc.ResultDocument):
pytest.param("a076114_rd"),
pytest.param("pma0101_rd"),
pytest.param("dotnet_1c444e_rd"),
pytest.param(""),
],
)
def test_round_trip(request, rd_file):