Merge pull request #155 from fireeye/ana-desc-regex

Enable descriptions for regular expressions
2026-01-08 03:11:05 -08:00 · 2020-07-15 15:22:50 -06:00
parent 881ca88bfe 80ba19a466
commit 3bf030c2d4
7 changed files with 78 additions and 68 deletions
--- a/capa/engine.py
+++ b/capa/engine.py
@@ -1,6 +1,5 @@
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.

-import re
 import sys
 import copy
 import collections
@@ -176,39 +175,6 @@ class Range(Statement):
            return "range(%s, min=%d, max=%d)" % (str(self.child), self.min, self.max)


-class Regex(Statement):
-    """match if the given pattern matches a String feature."""
-
-    def __init__(self, pattern):
-        super(Regex, self).__init__()
-        self.pattern = pattern
-        pat = self.pattern[len("/") : -len("/")]
-        flags = re.DOTALL
-        if pattern.endswith("/i"):
-            pat = self.pattern[len("/") : -len("/i")]
-            flags |= re.IGNORECASE
-        self.re = re.compile(pat, flags)
-        self.match = ""
-
-    def evaluate(self, ctx):
-        for feature, locations in ctx.items():
-            if not isinstance(feature, (capa.features.String,)):
-                continue
-
-            # `re.search` finds a match anywhere in the given string
-            # which implies leading and/or trailing whitespace.
-            # using this mode cleans is more convenient for rule authors,
-            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
-            if self.re.search(feature.value):
-                self.match = feature.value
-                return Result(True, self, [], locations=locations)
-
-        return Result(False, self, [])
-
-    def __str__(self):
-        return 'regex(string =~ %s, matched = "%s")' % (self.pattern, self.match)
-
-
 class Subscope(Statement):
    """
    a subscope element is a placeholder in a rule - it should not be evaluated directly.
--- a/capa/features/init.py
+++ b/capa/features/init.py
@@ -1,5 +1,6 @@
 # Copyright (C) 2020 FireEye, Inc. All Rights Reserved.

+import re
 import sys
 import codecs
 import logging
@@ -82,6 +83,50 @@ class String(Feature):
        super(String, self).__init__(value, description)


+class Regex(String):
+    def __init__(self, value, description=None):
+        super(Regex, self).__init__(value, description)
+        pat = self.value[len("/") : -len("/")]
+        flags = re.DOTALL
+        if value.endswith("/i"):
+            pat = self.value[len("/") : -len("/i")]
+            flags |= re.IGNORECASE
+        try:
+            self.re = re.compile(pat, flags)
+        except re.error:
+            if value.endswith("/i"):
+                value = value[: -len("i")]
+            raise ValueError(
+                "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
+            )
+        self.match = None
+
+    def evaluate(self, ctx):
+        for feature, locations in ctx.items():
+            if not isinstance(feature, (capa.features.String,)):
+                continue
+
+            # `re.search` finds a match anywhere in the given string,
+            # which behaves as if the pattern had leading and/or trailing wildcards.
+            # using this mode is more convenient for rule authors,
+            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
+            if self.re.search(feature.value):
+                self.match = feature.value
+                return capa.engine.Result(True, self, [], locations=locations)
+
+        return capa.engine.Result(False, self, [])
+
+    def __str__(self):
+        return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match)
+
+
+class StringFactory(object):
+    def __new__(self, value, description):
+        if value.startswith("/") and (value.endswith("/") or value.endswith("/i")):
+            return Regex(value, description)
+        return String(value, description)
+
+
 class Bytes(Feature):
    def __init__(self, value, description=None):
        super(Bytes, self).__init__(value, description)
--- a/capa/ida/explorer/model.py
+++ b/capa/ida/explorer/model.py
@@ -373,11 +373,6 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
            return parent2
        elif statement["type"] == "subscope":
            return CapaExplorerSubscopeItem(parent, statement[statement["type"]])
-        elif statement["type"] == "regex":
-            # regex is a `Statement` not a `Feature`
-            # this is because it doesn't get extracted, but applies to all strings in scope.
-            # so we have to handle it here
-            return CapaExplorerFeatureItem(parent, "regex(%s)" % statement["pattern"], details=statement["match"])
        else:
            raise RuntimeError("unexpected match statement type: " + str(statement))

@@ -496,7 +491,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):

        if len(locations) == 1:
            # only one location for feature so no need to nest children
-            parent2 = self.render_capa_doc_feature(parent, feature, next(iter(locations)), doc, display=display)
+            parent2 = self.render_capa_doc_feature(
+                parent, feature, next(iter(locations)), doc, display=display,
+            )
        else:
            # feature has multiple children, nest  under one parent feature node
            parent2 = CapaExplorerFeatureItem(parent, display)
@@ -539,6 +536,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
                parent, display, source=doc["rules"].get(feature[feature["type"]], {}).get("source", "")
            )

+        if feature["type"] == "regex":
+            return CapaExplorerFeatureItem(parent, display, location, details=feature["match"])
+
        if feature["type"] == "basicblock":
            return CapaExplorerBlockItem(parent, location)

--- a/capa/render/init.py
+++ b/capa/render/init.py
@@ -46,13 +46,6 @@ def convert_statement_to_result_document(statement):
            "max": statement.max,
            "child": convert_feature_to_result_document(statement.child),
        }
-    elif isinstance(statement, capa.engine.Regex):
-        return {
-            "type": "regex",
-            "pattern": statement.pattern,
-            # the string that was matched
-            "match": statement.match,
-        }
    elif isinstance(statement, capa.engine.Subscope):
        return {
            "type": "subscope",
@@ -90,7 +83,8 @@ def convert_feature_to_result_document(feature):
    result = {"type": feature.name, feature.name: feature.get_value_str()}
    if feature.description:
        result["description"] = feature.description
-
+    if feature.name == "regex":
+        result["match"] = feature.match
    return result


--- a/capa/render/vverbose.py
+++ b/capa/render/vverbose.py
@@ -70,11 +70,6 @@ def render_statement(ostream, match, statement, indent=0):
    elif statement["type"] == "subscope":
        ostream.write(statement["subscope"])
        ostream.writeln(":")
-    elif statement["type"] == "regex":
-        # regex is a `Statement` not a `Feature`
-        # this is because it doesn't get extracted, but applies to all strings in scope.
-        # so we have to handle it here
-        ostream.writeln("string: %s" % (statement["match"]))
    else:
        raise RuntimeError("unexpected match statement type: " + str(statement))

@@ -82,11 +77,17 @@ def render_statement(ostream, match, statement, indent=0):
 def render_feature(ostream, match, feature, indent=0):
    ostream.write("  " * indent)

-    ostream.write(feature["type"])
+    key = feature["type"]
+    value = feature[feature["type"]]
+    if key == "regex":
+        key = "string"  # render string for regex to mirror the rule source
+        value = feature["match"]  # the match provides more information than the value for regex
+
+    ostream.write(key)
    ostream.write(": ")

-    if feature[feature["type"]]:
-        ostream.write(rutils.bold2(feature[feature["type"]]))
+    if value:
+        ostream.write(rutils.bold2(value))

        if "description" in feature:
            ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
--- a/capa/rules.py
+++ b/capa/rules.py
@@ -184,7 +184,7 @@ def parse_feature(key):
    if key == "api":
        return capa.features.insn.API
    elif key == "string":
-        return capa.features.String
+        return capa.features.StringFactory
    elif key == "bytes":
        return capa.features.Bytes
    elif key == "number":
@@ -348,19 +348,13 @@ def build_statements(d, scope):
            raise InvalidRule("unexpected range: %s" % (count))
    elif key == "string" and not isinstance(d[key], six.string_types):
        raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key])
-    elif key == "string" and d[key].startswith("/") and (d[key].endswith("/") or d[key].endswith("/i")):
-        try:
-            return Regex(d[key])
-        except re.error:
-            if d[key].endswith("/i"):
-                d[key] = d[key][: -len("i")]
-            raise InvalidRule(
-                "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % d[key]
-            )
    else:
        Feature = parse_feature(key)
        value, description = parse_description(d[key], key, d.get("description"))
-        feature = Feature(value, description)
+        try:
+            feature = Feature(value, description)
+        except ValueError as e:
+            raise InvalidRule(str(e))
        ensure_feature_valid_for_scope(scope, feature)
        return feature

--- a/tests/test_rules.py
+++ b/tests/test_rules.py
@@ -74,12 +74,22 @@ def test_rule_yaml_descriptions():
                    - number: 1 = This is the number 1
                    - string: This program cannot be run in DOS mode.
                      description: MS-DOS stub message
+                    - string: '/SELECT.*FROM.*WHERE/i'
+                      description: SQL WHERE Clause
                    - count(number(2 = AF_INET/SOCK_DGRAM)): 2
        """
    )
    r = capa.rules.Rule.from_yaml(rule)
    assert (
-        r.evaluate({Number(1): {1}, Number(2): {2, 3}, String("This program cannot be run in DOS mode."): {4}}) == True
+        r.evaluate(
+            {
+                Number(1): {1},
+                Number(2): {2, 3},
+                String("This program cannot be run in DOS mode."): {4},
+                String("SELECT password FROM hidden_table WHERE user == admin"): {5},
+            }
+        )
+        == True
    )