Merge branch 'dynamic-feature-extraction' into fix/dynamic-proto

2025-12-12 15:49:46 -08:00 · 2023-09-05 08:18:51 +00:00
parent 88ee6e661e dd0eadb438
commit 766b05e5c3
14 changed files with 86 additions and 86 deletions
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,6 +1,7 @@
 [submodule "rules"]
 	path = rules
 	url = ../capa-rules.git
+	branch = dynamic-syntax
 [submodule "tests/data"]
 	path = tests/data
 	url = ../capa-testfiles.git
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -304,7 +304,7 @@ def match(rules: List["capa.rules.Rule"], features: FeatureSet, addr: Address) -
    other strategies can be imagined that match differently; implement these elsewhere.
    specifically, this routine does "top down" matching of the given rules against the feature set.
    """
-    results = collections.defaultdict(list)  # type: MatchResults
+    results: MatchResults = collections.defaultdict(list)

    # copy features so that we can modify it
    # without affecting the caller (keep this function pure)
--- a/capa/features/extractors/base_extractor.py
+++ b/capa/features/extractors/base_extractor.py
@@ -412,8 +412,6 @@ class DynamicFeatureExtractor:
        """
        Yields all the features of a process. These include:
        - file features of the process' image
-        - inter-process injection
-        - detected dynamic DLL loading
        """
        raise NotImplementedError()

@@ -429,8 +427,6 @@ class DynamicFeatureExtractor:
        """
        Yields all the features of a thread. These include:
        - sequenced api traces
-        - file/registry interactions
-        - network activity
        """
        raise NotImplementedError()

--- a/capa/features/extractors/binja/extractor.py
+++ b/capa/features/extractors/binja/extractor.py
@@ -6,7 +6,6 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 from typing import List, Tuple, Iterator
-from pathlib import Path

 import binaryninja as binja

@@ -29,7 +28,7 @@ from capa.features.extractors.base_extractor import (

 class BinjaFeatureExtractor(StaticFeatureExtractor):
    def __init__(self, bv: binja.BinaryView):
-        super().__init__(hashes=SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()))
+        super().__init__(hashes=SampleHashes.from_bytes(bv.file.raw.read(0, len(bv.file.raw))))
        self.bv = bv
        self.global_features: List[Tuple[Feature, Address]] = []
        self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv))
--- a/capa/features/extractors/helpers.py
+++ b/capa/features/extractors/helpers.py
@@ -55,8 +55,8 @@ def generate_symbols(dll: str, symbol: str) -> Iterator[str]:
    dll = dll.lower()

    # trim extensions observed in dynamic traces
-    dll = dll.replace(".dll", "")
-    dll = dll.replace(".drv", "")
+    dll = dll[0:-4] if dll.endswith(".dll") else dll
+    dll = dll[0:-4] if dll.endswith(".drv") else dll

    # kernel32.CreateFileA
    yield f"{dll}.{symbol}"
--- a/capa/features/extractors/null.py
+++ b/capa/features/extractors/null.py
@@ -59,6 +59,9 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor):
    def get_base_address(self):
        return self.base_address

+    def get_sample_hashes(self) -> SampleHashes:
+        return self.sample_hashes
+
    def extract_global_features(self):
        for feature in self.global_features:
            yield feature, NO_ADDRESS
@@ -121,6 +124,9 @@ class NullDynamicFeatureExtractor(DynamicFeatureExtractor):
        for feature in self.global_features:
            yield feature, NO_ADDRESS

+    def get_sample_hashes(self) -> SampleHashes:
+        return self.sample_hashes
+
    def extract_file_features(self):
        for address, feature in self.file_features:
            yield feature, address
--- a/capa/features/freeze/__init__.py
+++ b/capa/features/freeze/__init__.py
@@ -53,7 +53,6 @@ class AddressType(str, Enum):
    PROCESS = "process"
    THREAD = "thread"
    CALL = "call"
-    DYNAMIC = "dynamic"
    NO_ADDRESS = "no address"


@@ -424,7 +423,7 @@ def dumps_static(extractor: StaticFeatureExtractor) -> str:
    # Mypy is unable to recognise `global_` as a argument due to alias

    freeze = Freeze(
-        version=2,
+        version=3,
        base_address=Address.from_capa(extractor.get_base_address()),
        sample_hashes=extractor.get_sample_hashes(),
        extractor=Extractor(name=extractor.__class__.__name__),
@@ -528,7 +527,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
    base_addr = get_base_addr() if get_base_addr else capa.features.address.NO_ADDRESS

    freeze = Freeze(
-        version=2,
+        version=3,
        base_address=Address.from_capa(base_addr),
        sample_hashes=extractor.get_sample_hashes(),
        extractor=Extractor(name=extractor.__class__.__name__),
@@ -542,7 +541,7 @@ def dumps_dynamic(extractor: DynamicFeatureExtractor) -> str:
 def loads_static(s: str) -> StaticFeatureExtractor:
    """deserialize a set of features (as a NullStaticFeatureExtractor) from a string."""
    freeze = Freeze.model_validate_json(s)
-    if freeze.version != 2:
+    if freeze.version != 3:
        raise ValueError(f"unsupported freeze format version: {freeze.version}")

    assert isinstance(freeze.features, StaticFeatures)
@@ -575,7 +574,7 @@ def loads_static(s: str) -> StaticFeatureExtractor:
 def loads_dynamic(s: str) -> DynamicFeatureExtractor:
    """deserialize a set of features (as a NullDynamicFeatureExtractor) from a string."""
    freeze = Freeze.parse_raw(s)
-    if freeze.version != 2:
+    if freeze.version != 3:
        raise ValueError(f"unsupported freeze format version: {freeze.version}")

    assert isinstance(freeze.features, DynamicFeatures)
@@ -624,11 +623,11 @@ def is_freeze(buf: bytes) -> bool:
    return buf[: len(MAGIC)] == MAGIC


-def is_static(buf: bytes) -> bool:
+def is_static_freeze(buf: bytes) -> bool:
    return buf[: len(STATIC_MAGIC)] == STATIC_MAGIC


-def is_dynamic(buf: bytes) -> bool:
+def is_dynamic_freeze(buf: bytes) -> bool:
    return buf[: len(DYNAMIC_MAGIC)] == DYNAMIC_MAGIC


@@ -636,9 +635,9 @@ def load(buf: bytes):
    """deserialize a set of features (as a NullFeatureExtractor) from a byte array."""
    if not is_freeze(buf):
        raise ValueError("missing magic header")
-    if is_static(buf):
+    if is_static_freeze(buf):
        return loads_static(zlib.decompress(buf[len(STATIC_MAGIC) :]).decode("utf-8"))
-    elif is_dynamic(buf):
+    elif is_dynamic_freeze(buf):
        return loads_dynamic(zlib.decompress(buf[len(DYNAMIC_MAGIC) :]).decode("utf-8"))
    else:
        raise ValueError("invalid magic header")
--- a/capa/main.py
+++ b/capa/main.py
@@ -145,7 +145,7 @@ def find_instruction_capabilities(
    returns: tuple containing (features for instruction, match results for instruction)
    """
    # all features found for the instruction.
-    features = collections.defaultdict(set)  # type: FeatureSet
+    features: FeatureSet = collections.defaultdict(set)

    for feature, addr in itertools.chain(
        extractor.extract_insn_features(f, bb, insn), extractor.extract_global_features()
@@ -173,11 +173,11 @@ def find_basic_block_capabilities(
    """
    # all features found within this basic block,
    # includes features found within instructions.
-    features = collections.defaultdict(set)  # type: FeatureSet
+    features: FeatureSet = collections.defaultdict(set)

    # matches found at the instruction scope.
    # might be found at different instructions, thats ok.
-    insn_matches = collections.defaultdict(list)  # type: MatchResults
+    insn_matches: MatchResults = collections.defaultdict(list)

    for insn in extractor.get_instructions(f, bb):
        ifeatures, imatches = find_instruction_capabilities(ruleset, extractor, f, bb, insn)
@@ -213,15 +213,15 @@ def find_code_capabilities(
    """
    # all features found within this function,
    # includes features found within basic blocks (and instructions).
-    function_features = collections.defaultdict(set)  # type: FeatureSet
+    function_features: FeatureSet = collections.defaultdict(set)

    # matches found at the basic block scope.
    # might be found at different basic blocks, thats ok.
-    bb_matches = collections.defaultdict(list)  # type: MatchResults
+    bb_matches: MatchResults = collections.defaultdict(list)

    # matches found at the instruction scope.
    # might be found at different instructions, thats ok.
-    insn_matches = collections.defaultdict(list)  # type: MatchResults
+    insn_matches: MatchResults = collections.defaultdict(list)

    for bb in extractor.get_basic_blocks(fh):
        features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, fh, bb)
@@ -242,7 +242,7 @@ def find_code_capabilities(


 def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
-    file_features = collections.defaultdict(set)  # type: FeatureSet
+    file_features: FeatureSet = collections.defaultdict(set)

    for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
        # not all file features may have virtual addresses.
@@ -265,9 +265,9 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi
 def find_static_capabilities(
    ruleset: RuleSet, extractor: StaticFeatureExtractor, disable_progress=None
 ) -> Tuple[MatchResults, Any]:
-    all_function_matches = collections.defaultdict(list)  # type: MatchResults
-    all_bb_matches = collections.defaultdict(list)  # type: MatchResults
-    all_insn_matches = collections.defaultdict(list)  # type: MatchResults
+    all_function_matches: MatchResults = collections.defaultdict(list)
+    all_bb_matches: MatchResults = collections.defaultdict(list)
+    all_insn_matches: MatchResults = collections.defaultdict(list)

    feature_counts = rdoc.StaticFeatureCounts(file=0, functions=())
    library_functions: Tuple[rdoc.LibraryFunction, ...] = ()
@@ -328,7 +328,7 @@ def find_static_capabilities(

    # collection of features that captures the rule matches within function, BB, and instruction scopes.
    # mapping from feature (matched rule) to set of addresses at which it matched.
-    function_and_lower_features = collections.defaultdict(set)  # type: FeatureSet
+    function_and_lower_features: FeatureSet = collections.defaultdict(set)
    for rule_name, results in itertools.chain(
        all_function_matches.items(), all_bb_matches.items(), all_insn_matches.items()
    ):
@@ -368,7 +368,7 @@ def find_call_capabilities(
    returns: tuple containing (features for call, match results for call)
    """
    # all features found for the call.
-    features = collections.defaultdict(set)  # type: FeatureSet
+    features: FeatureSet = collections.defaultdict(set)

    for feature, addr in itertools.chain(
        extractor.extract_call_features(ph, th, ch), extractor.extract_global_features()
@@ -396,11 +396,11 @@ def find_thread_capabilities(
    """
    # all features found within this thread,
    # includes features found within calls.
-    features = collections.defaultdict(set)  # type: FeatureSet
+    features: FeatureSet = collections.defaultdict(set)

    # matches found at the call scope.
    # might be found at different calls, thats ok.
-    call_matches = collections.defaultdict(list)  # type: MatchResults
+    call_matches: MatchResults = collections.defaultdict(list)

    for ch in extractor.get_calls(ph, th):
        ifeatures, imatches = find_call_capabilities(ruleset, extractor, ph, th, ch)
@@ -434,15 +434,15 @@ def find_process_capabilities(
    """
    # all features found within this process,
    # includes features found within threads (and calls).
-    process_features = collections.defaultdict(set)  # type: FeatureSet
+    process_features: FeatureSet = collections.defaultdict(set)

    # matches found at the basic threads.
    # might be found at different threads, thats ok.
-    thread_matches = collections.defaultdict(list)  # type: MatchResults
+    thread_matches: MatchResults = collections.defaultdict(list)

    # matches found at the call scope.
    # might be found at different calls, thats ok.
-    call_matches = collections.defaultdict(list)  # type: MatchResults
+    call_matches: MatchResults = collections.defaultdict(list)

    for th in extractor.get_threads(ph):
        features, tmatches, cmatches = find_thread_capabilities(ruleset, extractor, ph, th)
@@ -465,9 +465,9 @@ def find_process_capabilities(
 def find_dynamic_capabilities(
    ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None
 ) -> Tuple[MatchResults, Any]:
-    all_process_matches = collections.defaultdict(list)  # type: MatchResults
-    all_thread_matches = collections.defaultdict(list)  # type: MatchResults
-    all_call_matches = collections.defaultdict(list)  # type: MatchResults
+    all_process_matches: MatchResults = collections.defaultdict(list)
+    all_thread_matches: MatchResults = collections.defaultdict(list)
+    all_call_matches: MatchResults = collections.defaultdict(list)

    feature_counts = rdoc.DynamicFeatureCounts(file=0, processes=())

@@ -502,7 +502,7 @@ def find_dynamic_capabilities(

    # collection of features that captures the rule matches within process and thread scopes.
    # mapping from feature (matched rule) to set of addresses at which it matched.
-    process_and_lower_features = collections.defaultdict(set)  # type: FeatureSet
+    process_and_lower_features: FeatureSet = collections.defaultdict(set)
    for rule_name, results in itertools.chain(
        all_process_matches.items(), all_thread_matches.items(), all_call_matches.items()
    ):
@@ -902,7 +902,7 @@ def get_rules(
    if ruleset is not None:
        return ruleset

-    rules = []  # type: List[Rule]
+    rules: List[Rule] = []

    total_rule_count = len(rule_file_paths)
    for i, (path, content) in enumerate(zip(rule_file_paths, rule_contents)):
@@ -1021,7 +1021,7 @@ def collect_metadata(
            md5=md5,
            sha1=sha1,
            sha256=sha256,
-            path=str(Path(sample_path).resolve()),
+            path=Path(sample_path).resolve().as_posix(),
        ),
        flavor=flavor,
        analysis=get_sample_analysis(
@@ -1087,7 +1087,6 @@ def compute_static_layout(rules, extractor: StaticFeatureExtractor, capabilities
    otherwise, we may pollute the json document with
    a large amount of un-referenced data.
    """
-    assert isinstance(extractor, StaticFeatureExtractor)
    functions_by_bb: Dict[Address, Address] = {}
    bbs_by_function: Dict[Address, List[Address]] = {}
    for f in extractor.get_functions():
--- a/capa/render/verbose.py
+++ b/capa/render/verbose.py
@@ -54,10 +54,6 @@ def format_address(address: frz.Address) -> str:
        assert isinstance(token, int)
        assert isinstance(offset, int)
        return f"token({capa.helpers.hex(token)})+{capa.helpers.hex(offset)}"
-    elif address.type == frz.AddressType.DYNAMIC:
-        assert isinstance(address.value, tuple)
-        ppid, pid, tid, id_, return_address = address.value
-        return f"process ppid: {ppid}, process pid: {pid}, thread id: {tid}, call: {id_}, return address: {capa.helpers.hex(return_address)}"
    elif address.type == frz.AddressType.PROCESS:
        assert isinstance(address.value, tuple)
        ppid, pid = address.value
@@ -79,7 +75,7 @@ def format_address(address: frz.Address) -> str:
        raise ValueError("unexpected address type")


-def render_static_meta(ostream, doc: rd.ResultDocument):
+def render_static_meta(ostream, meta: rd.Metadata):
    """
    like:

@@ -100,33 +96,33 @@ def render_static_meta(ostream, doc: rd.ResultDocument):
        total feature count  1918
    """

-    assert isinstance(doc.meta.analysis, rd.StaticAnalysis)
+    assert isinstance(meta.analysis, rd.StaticAnalysis)
    rows = [
-        ("md5", doc.meta.sample.md5),
-        ("sha1", doc.meta.sample.sha1),
-        ("sha256", doc.meta.sample.sha256),
-        ("path", doc.meta.sample.path),
-        ("timestamp", doc.meta.timestamp),
-        ("capa version", doc.meta.version),
-        ("os", doc.meta.analysis.os),
-        ("format", doc.meta.analysis.format),
-        ("arch", doc.meta.analysis.arch),
-        ("analysis", doc.meta.flavor),
-        ("extractor", doc.meta.analysis.extractor),
-        ("base address", format_address(doc.meta.analysis.base_address)),
-        ("rules", "\n".join(doc.meta.analysis.rules)),
-        ("function count", len(doc.meta.analysis.feature_counts.functions)),
-        ("library function count", len(doc.meta.analysis.library_functions)),
+        ("md5", meta.sample.md5),
+        ("sha1", meta.sample.sha1),
+        ("sha256", meta.sample.sha256),
+        ("path", meta.sample.path),
+        ("timestamp", meta.timestamp),
+        ("capa version", meta.version),
+        ("os", meta.analysis.os),
+        ("format", meta.analysis.format),
+        ("arch", meta.analysis.arch),
+        ("analysis", meta.flavor),
+        ("extractor", meta.analysis.extractor),
+        ("base address", format_address(meta.analysis.base_address)),
+        ("rules", "\n".join(meta.analysis.rules)),
+        ("function count", len(meta.analysis.feature_counts.functions)),
+        ("library function count", len(meta.analysis.library_functions)),
        (
            "total feature count",
-            doc.meta.analysis.feature_counts.file + sum(f.count for f in doc.meta.analysis.feature_counts.functions),
+            meta.analysis.feature_counts.file + sum(f.count for f in meta.analysis.feature_counts.functions),
        ),
    ]

    ostream.writeln(tabulate.tabulate(rows, tablefmt="plain"))


-def render_dynamic_meta(ostream, doc: rd.ResultDocument):
+def render_dynamic_meta(ostream, meta: rd.Metadata):
    """
    like:

@@ -145,24 +141,24 @@ def render_dynamic_meta(ostream, doc: rd.ResultDocument):
        total feature count  1918
    """

-    assert isinstance(doc.meta.analysis, rd.DynamicAnalysis)
+    assert isinstance(meta.analysis, rd.DynamicAnalysis)
    rows = [
-        ("md5", doc.meta.sample.md5),
-        ("sha1", doc.meta.sample.sha1),
-        ("sha256", doc.meta.sample.sha256),
-        ("path", doc.meta.sample.path),
-        ("timestamp", doc.meta.timestamp),
-        ("capa version", doc.meta.version),
-        ("os", doc.meta.analysis.os),
-        ("format", doc.meta.analysis.format),
-        ("arch", doc.meta.analysis.arch),
-        ("analysis", doc.meta.flavor),
-        ("extractor", doc.meta.analysis.extractor),
-        ("rules", "\n".join(doc.meta.analysis.rules)),
-        ("process count", len(doc.meta.analysis.feature_counts.processes)),
+        ("md5", meta.sample.md5),
+        ("sha1", meta.sample.sha1),
+        ("sha256", meta.sample.sha256),
+        ("path", meta.sample.path),
+        ("timestamp", meta.timestamp),
+        ("capa version", meta.version),
+        ("os", meta.analysis.os),
+        ("format", meta.analysis.format),
+        ("arch", meta.analysis.arch),
+        ("analysis", meta.flavor),
+        ("extractor", meta.analysis.extractor),
+        ("rules", "\n".join(meta.analysis.rules)),
+        ("process count", len(meta.analysis.feature_counts.processes)),
        (
            "total feature count",
-            doc.meta.analysis.feature_counts.file + sum(p.count for p in doc.meta.analysis.feature_counts.processes),
+            meta.analysis.feature_counts.file + sum(p.count for p in meta.analysis.feature_counts.processes),
        ),
    ]

@@ -171,9 +167,9 @@ def render_dynamic_meta(ostream, doc: rd.ResultDocument):

 def render_meta(osstream, doc: rd.ResultDocument):
    if isinstance(doc.meta.analysis, rd.StaticAnalysis):
-        render_static_meta(osstream, doc)
+        render_static_meta(osstream, doc.meta)
    elif isinstance(doc.meta.analysis, rd.DynamicAnalysis):
-        render_dynamic_meta(osstream, doc)
+        render_dynamic_meta(osstream, doc.meta)
    else:
        raise ValueError("invalid meta analysis")

--- a/capa/rules/__init__.py
+++ b/capa/rules/__init__.py
@@ -86,6 +86,10 @@ class Scope(str, Enum):
    # not used to validate rules.
    GLOBAL = "global"

+    @classmethod
+    def to_yaml(cls, representer, node):
+        return representer.represent_str(f"{node.value}")
+

 # these literals are used to check if the flavor
 # of a rule is correct.
@@ -979,6 +983,7 @@ class Rule:

        # we use the ruamel.yaml parser because it supports roundtripping of documents with comments.
        y = ruamel.yaml.YAML(typ="rt")
+        y.register_class(Scope)

        # use block mode, not inline json-like mode
        y.default_flow_style = False
--- a/2
+++ b/2
--- a/scripts/import-to-ida.py
+++ b/scripts/import-to-ida.py
@@ -90,7 +90,7 @@ def main():
            continue
        if rule.meta.is_subscope_rule:
            continue
-        if capa.rules.Scope.FUNCTION in rule.meta.scopes:
+        if rule.meta.scopes.static == capa.rules.Scope.FUNCTION:
            continue

        ns = rule.meta.namespace
--- a/tests/data
+++ b/tests/data
--- a/tests/test_result_document.py
+++ b/tests/test_result_document.py
@@ -263,7 +263,6 @@ def assert_round_trip(rd: rdoc.ResultDocument):
        pytest.param("a076114_rd"),
        pytest.param("pma0101_rd"),
        pytest.param("dotnet_1c444e_rd"),
-        pytest.param(""),
    ],
 )
 def test_round_trip(request, rd_file):