capa/scripts/detect_duplicate_features.py

# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
#  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import sys
import logging
import argparse
from typing import Set
from pathlib import Path

import capa.main
import capa.rules
from capa.features.common import Feature

logger = logging.getLogger("detect_duplicate_features")


def get_features(rule_path: str) -> Set[Feature]:
    """
    Extracts all features from a given rule file.

    Args:
        rule_path (str): The path to the rule file to extract features from.

    Returns:
        set: A set of all feature statements contained within the rule file.
    """
    with Path(rule_path).open("r", encoding="utf-8") as f:
        try:
            new_rule = capa.rules.Rule.from_yaml(f.read())
            return new_rule.extract_all_features()
        except Exception as e:
            logger.error("Error: New rule %s %s %s", rule_path, str(type(e)), str(e))
            sys.exit(-1)


def find_overlapping_rules(new_rule_path, rules_path):
    if not new_rule_path.endswith(".yml"):
        logger.error("FileNotFoundError ! New rule file name doesn't end with .yml")
        sys.exit(-1)

    # Loads features of new rule in a list.
    new_rule_features = get_features(new_rule_path)
    count = 0
    overlapping_rules = []

    # capa.rules.RuleSet stores all rules in given paths
    ruleset = capa.rules.get_rules(rules_path)

    for rule_name, rule in ruleset.rules.items():
        rule_features = rule.extract_all_features()

        if not len(rule_features):
            continue
        count += 1
        # Checks if any features match between existing and new rule.
        if any(feature in rule_features for feature in new_rule_features):
            overlapping_rules.append(rule_name)

    result = {"overlapping_rules": overlapping_rules, "count": count}
    return result


def main():
    parser = argparse.ArgumentParser(description="Find overlapping features in Capa rules.")

    parser.add_argument("rules", type=str, action="append", help="Path to rules")
    parser.add_argument("new_rule", type=str, help="Path to new rule")

    args = parser.parse_args()

    new_rule_path = args.new_rule
    rules_path = [Path(rule) for rule in args.rules]

    result = find_overlapping_rules(new_rule_path, rules_path)

    print("\nNew rule path : %s" % new_rule_path)
    print("Number of rules checked : %s " % result["count"])
    if result["overlapping_rules"]:
        print("Paths to overlapping rules : ")
        for r in result["overlapping_rules"]:
            print("- %s" % r)
    else:
        print("Paths to overlapping rules : None")
    print("Number of rules containing same features : %s" % len(result["overlapping_rules"]))
    print("\n")

    return len(result["overlapping_rules"])


if __name__ == "__main__":
    sys.exit(main())