Merge pull request #401 from fireeye/linter-format

Lint rule formatting and improved rule dump
2025-12-12 15:49:46 -08:00 · 2021-01-28 09:18:20 -07:00
parent b5c2fb0259 0eb8d3e47c
commit 14e65c4601
3 changed files with 90 additions and 15 deletions
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -6,6 +6,7 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.

+import re
 import uuid
 import codecs
 import logging
@@ -600,6 +601,9 @@ class Rule(object):
        # use block mode, not inline json-like mode
        y.default_flow_style = False

+        # leave quotes unchanged
+        y.preserve_quotes = True
+
        # indent lists by two spaces below their parent
        #
        #     features:
@@ -614,16 +618,20 @@ class Rule(object):
        return y

    @classmethod
-    def from_yaml(cls, s):
-        # use pyyaml because it can be much faster than ruamel (pure python)
-        doc = yaml.load(s, Loader=cls._get_yaml_loader())
+    def from_yaml(cls, s, use_ruamel=False):
+        # default: pyyaml, because it can be much faster than ruamel (pure python).
+        # opt-in: ruamel, which enables nice formatting and doc roundtripping with comments.
+        if not use_ruamel:
+            doc = yaml.load(s, Loader=cls._get_yaml_loader())
+        else:
+            doc = cls._get_ruamel_yaml_parser().load(s)
        return cls.from_dict(doc, s)

    @classmethod
-    def from_yaml_file(cls, path):
+    def from_yaml_file(cls, path, use_ruamel=False):  # use_ruamel is passed through to from_yaml
        with open(path, "rb") as f:
            try:
-                return cls.from_yaml(f.read().decode("utf-8"))
+                return cls.from_yaml(f.read().decode("utf-8"), use_ruamel=use_ruamel)
            except InvalidRule as e:
                raise InvalidRuleWithPath(path, str(e))

@@ -716,7 +724,18 @@ class Rule(object):
        # tweaking `ruamel.indent()` doesn't quite give us the control we want.
        # so, add the two extra spaces that we've determined we need through experimentation.
        # see #263
-        doc = doc.replace("  description:", "    description:")
+        # only do this for the features section, so the meta description doesn't get reformatted
+        # assumes features section always exists
+        features_offset = doc.find("features")
+        doc = doc[:features_offset] + doc[features_offset:].replace("  description:", "    description:")
+
+        # for negative hex numbers, yaml dump outputs:
+        # - offset: !!int '0x-30'
+        # we prefer:
+        # - offset: -0x30
+        # the regex below makes this adjustment; it is ugly, but it spares us from digging into ruamel.yaml internals
+        doc = re.sub(r"!!int '0x-([0-9a-fA-F]+)'", r"-0x\1", doc)
+
        return doc


--- a/scripts/capafmt.py
+++ b/scripts/capafmt.py
@@ -38,6 +38,12 @@ def main(argv=None):
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
    parser.add_argument("-q", "--quiet", action="store_true", help="Disable all output but errors")
+    parser.add_argument(
+        "-c",
+        "--check",
+        action="store_true",
+        help="Don't output (reformatted) rule, only return status. 0 = no changes, 1 = would reformat",
+    )
    args = parser.parse_args(args=argv)

    if args.verbose:
@@ -50,12 +56,22 @@ def main(argv=None):
    logging.basicConfig(level=level)
    logging.getLogger("capafmt").setLevel(level)

-    rule = capa.rules.Rule.from_yaml_file(args.path)
+    rule = capa.rules.Rule.from_yaml_file(args.path, use_ruamel=True)
+    reformatted_rule = rule.to_yaml()
+
+    if args.check:
+        if rule.definition == reformatted_rule:
+            logger.info("rule is formatted correctly, nice! (%s)", rule.name)
+            return 0
+        else:
+            logger.info("rule requires reformatting (%s)", rule.name)
+            return 1
+
    if args.in_place:
        with open(args.path, "wb") as f:
-            f.write(rule.to_yaml().encode("utf-8"))
+            f.write(reformatted_rule.encode("utf-8"))
    else:
-        print(rule.to_yaml().rstrip("\n"))
+        print(reformatted_rule)

    return 0

--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -17,6 +17,7 @@ import os
 import sys
 import time
 import string
+import difflib
 import hashlib
 import logging
 import os.path
@@ -25,6 +26,7 @@ import itertools
 import posixpath

 import capa.main
+import capa.rules
 import capa.engine
 import capa.features
 import capa.features.insn
@@ -277,6 +279,32 @@ class FeatureNegativeNumber(Lint):
        return False


+class FormatSingleEmptyLineEOF(Lint):
+    name = "EOF format"
+    recommendation = "end file with a single empty line"
+
+    def check_rule(self, ctx, rule):
+        # a violation (True) is anything other than exactly one trailing newline
+        text = rule.definition
+        return text.endswith("\n\n") or not text.endswith("\n")
+
+
+class FormatIncorrect(Lint):
+    name = "rule format incorrect"
+    recommendation_template = "use scripts/capafmt.py or adjust as follows\n{:s}"
+
+    def check_rule(self, ctx, rule):
+        # re-parse the rule text with ruamel and dump it again; a mismatch is a violation
+        current = rule.definition
+        formatted = capa.rules.Rule.from_yaml(current, use_ruamel=True).to_yaml()
+        if current == formatted:
+            return False
+        # attach a line-by-line ndiff so the recommendation shows what to change
+        delta = difflib.ndiff(current.splitlines(True), formatted.splitlines(True))
+        self.recommendation = self.recommendation_template.format("".join(delta))
+        return True
+
+
 def run_lints(lints, ctx, rule):
    for lint in lints:
        if lint.check_rule(ctx, rule):
@@ -332,15 +360,25 @@ FEATURE_LINTS = (
 )


-def get_normpath(path):
-    return posixpath.normpath(path).replace(os.sep, "/")
-
-
 def lint_features(ctx, rule):
    features = get_features(ctx, rule)
    return run_feature_lints(FEATURE_LINTS, ctx, features)


+FORMAT_LINTS = (
+    FormatSingleEmptyLineEOF(),  # flags text that does not end with exactly one newline
+    FormatIncorrect(),  # flags text that differs from its ruamel round-trip via to_yaml()
+)
+
+
+def lint_format(ctx, rule):  # hand every lint in FORMAT_LINTS to run_lints for this rule
+    return run_lints(FORMAT_LINTS, ctx, rule)
+
+
+def get_normpath(path):  # posix-normalize `path`, then replace any os.sep with "/"
+    return posixpath.normpath(path).replace(os.sep, "/")  # NOTE(review): normpath sees "\\" as plain text; confirm order
+
+
 def get_features(ctx, rule):
    # get features from rule and all dependencies including subscopes and matched rules
    features = []
@@ -391,6 +429,7 @@ def lint_rule(ctx, rule):
            lint_meta(ctx, rule),
            lint_logic(ctx, rule),
            lint_features(ctx, rule),
+            lint_format(ctx, rule),
        )
    )

@@ -518,6 +557,7 @@ def main(argv=None):

    capa.main.set_vivisect_log_level(logging.CRITICAL)
    logging.getLogger("capa").setLevel(logging.CRITICAL)
+    logging.getLogger("viv_utils").setLevel(logging.CRITICAL)

    time0 = time.time()

@@ -549,8 +589,8 @@ def main(argv=None):

    did_violate = lint(ctx, rules)

-    diff = time.time() - time0
-    logger.debug("lint ran for ~ %02d:%02d", (diff // 60), diff)
+    min, sec = divmod(time.time() - time0, 60)
+    logger.debug("lints ran for ~ %02d:%02dm", min, sec)

    if not did_violate:
        logger.info("no suggestions, nice!")