rules: ruleset: add optimized match routine

2025-12-12 15:49:46 -08:00 · 2021-11-09 09:52:32 -07:00
parent 1311da99ff
commit e647ae2ac4
1 changed files with 141 additions and 1 deletions
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -23,7 +23,7 @@ except ImportError:
    # https://github.com/python/mypy/issues/1153
    from backports.functools_lru_cache import lru_cache  # type: ignore

-from typing import Any, Dict, List, Union, Iterator
+from typing import Any, Set, Dict, List, Tuple, Union, Iterator

 import yaml
 import ruamel.yaml
@@ -974,6 +974,15 @@ class RuleSet:
        self.rules = {rule.name: rule for rule in rules}
        self.rules_by_namespace = index_rules_by_namespace(rules)

+        # unstable
+        (self._easy_file_rules_by_feature, self._hard_file_rules) = self._index_rules_by_feature(self.file_rules)
+        (self._easy_function_rules_by_feature, self._hard_function_rules) = self._index_rules_by_feature(
+            self.function_rules
+        )
+        (self._easy_basic_block_rules_by_feature, self._hard_basic_block_rules) = self._index_rules_by_feature(
+            self.basic_block_rules
+        )
+
    def __len__(self):
        return len(self.rules)

@@ -983,6 +992,88 @@ class RuleSet:
    def __contains__(self, rulename):
        return rulename in self.rules

+    @staticmethod
+    def _index_rules_by_feature(rules) -> Tuple[Dict[Feature, Set[str]], List[str]]:
+        """
+        split the given rules into two structures:
+          - "easy rules" are indexed by feature,
+            such that you can quickly find the rules that contain a given feature.
+          - "hard rules" are those that contain substring/regex/bytes features or match statements.
+            these continue to be ordered topologically.
+
+        a rule evaluator can use the "easy rule" index to restrict the
+        candidate rules that might match a given set of features.
+
+        at this time, a rule evaluator can't do anything special with
+        the "hard rules". it must still do a full top-down match of each
+        rule, in topological order.
+        """
+
+        # we'll do a couple phases:
+        #
+        #  1. recursively visit all nodes in all rules,
+        #    a. indexing all features
+        #    b. recording the types of features found per rule
+        #  2. compute the easy and hard rule sets
+        #  3. remove hard rules from the rules-by-feature index
+        #  4. construct the topologically ordered list of hard rules
+        rules_with_easy_features: Set[str] = set()
+        rules_with_hard_features: Set[str] = set()
+        rules_by_feature: Dict[Feature, Set[str]] = collections.defaultdict(set)
+
+        def rec(rule: str, node: Union[Feature, Statement]):
+            if isinstance(
+                node,
+                (
+                    # these are the "hard features"
+                    # substring: scanning feature
+                    capa.features.common.Substring,
+                    # regex: scanning feature
+                    capa.features.common.Regex,
+                    # bytes: scanning feature
+                    capa.features.common.Bytes,
+                    # match: dependency on another rule,
+                    # which we have to evaluate first,
+                    # and is therefore tricky.
+                    capa.features.common.MatchedRule,
+                ),
+            ):
+                # hard feature: requires scan or match lookup
+                rules_with_hard_features.add(rule)
+            elif isinstance(node, capa.features.common.Feature):
+                # easy feature: hash lookup
+                rules_with_easy_features.add(rule)
+                rules_by_feature[node].add(rule)
+            elif isinstance(node, (ceng.Not, ceng.Range)):
+                return rec(rule, node.child)
+            elif isinstance(node, (ceng.And, ceng.Or, ceng.Some)):
+                for child in node.children:
+                    rec(rule, child)
+            else:
+                # programming error
+                raise Exception("programming error: unexpected node type: %s" % (node))
+
+        for rule in rules:
+            rec(rule.meta["name"], rule.statement)
+
+        # if a rule has a hard feature,
+        # don't consider it easy, and therefore,
+        # don't index any of its features.
+        #
+        # otherwise, it's an easy rule, and we index its features
+        for rules_with_feature in rules_by_feature.values():
+            rules_with_feature.difference_update(rules_with_hard_features)
+        easy_rules_by_feature = rules_by_feature
+
+        # `rules` is already topologically ordered,
+        # so extract our hard set into the topological ordering.
+        hard_rules = []
+        for rule in rules:
+            if rule.meta["name"] in rules_with_hard_features:
+                hard_rules.append(rule.meta["name"])
+
+        return (easy_rules_by_feature, hard_rules)
+
    @staticmethod
    def _get_rules_for_scope(rules, scope):
        """
@@ -1045,3 +1136,52 @@ class RuleSet:
                    rules_filtered.update(set(capa.rules.get_rules_and_dependencies(rules, rule.name)))
                    break
        return RuleSet(list(rules_filtered))
+
+    def match(self, scope: Scope, features: FeatureSet, va: int) -> Tuple[FeatureSet, ceng.MatchResults]:
+        """
+        match rules from this ruleset at the given scope against the given features.
+
+        this routine should act just like `capa.engine.match`,
+        except that it may be more performant.
+        """
+        if scope == scope.FILE:
+            easy_rules_by_feature = self._easy_file_rules_by_feature
+            hard_rule_names = self._hard_file_rules
+        elif scope == scope.FUNCTION:
+            easy_rules_by_feature = self._easy_function_rules_by_feature
+            hard_rule_names = self._hard_function_rules
+        elif scope == scope.BASIC_BLOCK:
+            easy_rules_by_feature = self._easy_basic_block_rules_by_feature
+            hard_rule_names = self._hard_basic_block_rules
+        else:
+            raise Exception("programming error: unexpected scope")
+
+        candidate_rule_names = set()
+        for feature in features:
+            easy_rules = easy_rules_by_feature.get(feature)
+            if easy_rules:
+                candidate_rule_names.update(easy_rules)
+
+        # first, match against the set of rules that have at least one
+        # feature shared with our feature set.
+        candidate_rules = [self.rules[name] for name in candidate_rule_names]
+        features2, easy_matches = ceng.match(candidate_rules, features, va)
+
+        # now, match against the (topologically ordered) list of rules
+        # that we can't really make any guesses about.
+        # these are rules with hard features, like substring/regex/bytes and match statements.
+        hard_rules = [self.rules[name] for name in hard_rule_names]
+        features3, hard_matches = ceng.match(hard_rules, features2, va)
+
+        # note that above, we ideally skip matching a bunch of
+        # rules that probably would never hit.
+        # specifically, "easy rules" that don't share any features with
+        # the feature set.
+
+        # MatchResults doesn't technically have an .update() method
+        # but a dict does.
+        matches = {}  # type: ignore
+        matches.update(easy_matches)
+        matches.update(hard_matches)
+
+        return (features3, matches)