diff --git a/capa/engine.py b/capa/engine.py index 34bb5cc7..8e5f0a3c 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -1,6 +1,5 @@ # Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -import re import sys import copy import collections @@ -176,39 +175,6 @@ class Range(Statement): return "range(%s, min=%d, max=%d)" % (str(self.child), self.min, self.max) -class Regex(Statement): - """match if the given pattern matches a String feature.""" - - def __init__(self, pattern): - super(Regex, self).__init__() - self.pattern = pattern - pat = self.pattern[len("/") : -len("/")] - flags = re.DOTALL - if pattern.endswith("/i"): - pat = self.pattern[len("/") : -len("/i")] - flags |= re.IGNORECASE - self.re = re.compile(pat, flags) - self.match = "" - - def evaluate(self, ctx): - for feature, locations in ctx.items(): - if not isinstance(feature, (capa.features.String,)): - continue - - # `re.search` finds a match anywhere in the given string - # which implies leading and/or trailing whitespace. - # using this mode cleans is more convenient for rule authors, - # so that they don't have to prefix/suffix their terms like: /.*foo.*/. - if self.re.search(feature.value): - self.match = feature.value - return Result(True, self, [], locations=locations) - - return Result(False, self, []) - - def __str__(self): - return 'regex(string =~ %s, matched = "%s")' % (self.pattern, self.match) - - class Subscope(Statement): """ a subscope element is a placeholder in a rule - it should not be evaluated directly. diff --git a/capa/features/__init__.py b/capa/features/__init__.py index e72daffd..40773a51 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -1,5 +1,6 @@ # Copyright (C) 2020 FireEye, Inc. All Rights Reserved. 
+import re
 import sys
 import codecs
 import logging
@@ -82,6 +83,86 @@ class String(Feature):
         super(String, self).__init__(value, description)
 
 
+class Regex(String):
+    """
+    a String feature whose value is a regular expression, written as `/pattern/` or `/pattern/i`.
+
+    it is evaluated against all the String features found in the given context.
+    the pattern is compiled with `re.DOTALL`, plus `re.IGNORECASE` when the `/i` suffix is present.
+    """
+
+    def __init__(self, value, description=None):
+        """
+        args:
+          value (str): the delimited pattern, like `/foo.*bar/` or `/foo.*bar/i`.
+          description (str): optional description of the feature.
+
+        raises:
+          ValueError: if the pattern is not a valid Python regular expression.
+        """
+        super(Regex, self).__init__(value, description)
+        pat = self.value[len("/") : -len("/")]
+        flags = re.DOTALL
+        if value.endswith("/i"):
+            pat = self.value[len("/") : -len("/i")]
+            flags |= re.IGNORECASE
+        try:
+            self.re = re.compile(pat, flags)
+        except re.error:
+            # the error message shows the pattern without the trailing `i` flag.
+            if value.endswith("/i"):
+                value = value[: -len("i")]
+            raise ValueError(
+                "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value
+            )
+        # the string most recently matched by `evaluate`, or None if nothing has matched yet.
+        self.match = None
+
+    def evaluate(self, ctx):
+        """
+        match if the pattern is found in any String feature of the given context.
+
+        args:
+          ctx: mapping from feature to the locations at which it was found.
+
+        returns:
+          capa.engine.Result: success with the locations of the first matching string, else failure.
+        """
+        for feature, locations in ctx.items():
+            if not isinstance(feature, String):
+                continue
+
+            # `re.search` finds a match anywhere in the given string,
+            # as if the pattern had leading and trailing wildcards.
+            # this mode is more convenient for rule authors,
+            # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
+            if self.re.search(feature.value):
+                self.match = feature.value
+                return capa.engine.Result(True, self, [], locations=locations)
+
+        return capa.engine.Result(False, self, [])
+
+    def __str__(self):
+        return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match)
+
+
+class StringFactory(object):
+    """
+    create the feature for the `string` key of a rule:
+    a Regex for values written as `/pattern/` or `/pattern/i`, otherwise a plain String.
+    """
+
+    def __new__(cls, value, description=None):
+        # the opening and closing `/` must be distinct characters, otherwise string literals
+        # like "/" and "/i" would become an empty pattern that matches every string.
+        if value.startswith("/"):
+            if value.endswith("/i") and len(value) >= len("//i"):
+                return Regex(value, description)
+            if value.endswith("/") and len(value) >= len("//"):
+                return Regex(value, description)
+        return String(value, description)
+
+
 class Bytes(Feature):
     def __init__(self, value, description=None):
         super(Bytes, self).__init__(value, description)
diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py
index 93949d16..26e387c5 100644
--- a/capa/ida/explorer/model.py
+++ b/capa/ida/explorer/model.py
@@ -373,11 +373,6 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
             return parent2
         elif statement["type"] == "subscope":
             return CapaExplorerSubscopeItem(parent, statement[statement["type"]])
-        elif statement["type"] == "regex":
-            # regex is a `Statement` not a `Feature`
-            # this is because it doesn't get extracted, but applies to all strings in scope.
- # so we have to handle it here - return CapaExplorerFeatureItem(parent, "regex(%s)" % statement["pattern"], details=statement["match"]) else: raise RuntimeError("unexpected match statement type: " + str(statement)) @@ -496,7 +491,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): if len(locations) == 1: # only one location for feature so no need to nest children - parent2 = self.render_capa_doc_feature(parent, feature, next(iter(locations)), doc, display=display) + parent2 = self.render_capa_doc_feature( + parent, feature, next(iter(locations)), doc, display=display, + ) else: # feature has multiple children, nest under one parent feature node parent2 = CapaExplorerFeatureItem(parent, display) @@ -539,6 +536,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): parent, display, source=doc["rules"].get(feature[feature["type"]], {}).get("source", "") ) + if feature["type"] == "regex": + return CapaExplorerFeatureItem(parent, display, location, details=feature["match"]) + if feature["type"] == "basicblock": return CapaExplorerBlockItem(parent, location) diff --git a/capa/render/__init__.py b/capa/render/__init__.py index dc44a244..0bc9e437 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -46,13 +46,6 @@ def convert_statement_to_result_document(statement): "max": statement.max, "child": convert_feature_to_result_document(statement.child), } - elif isinstance(statement, capa.engine.Regex): - return { - "type": "regex", - "pattern": statement.pattern, - # the string that was matched - "match": statement.match, - } elif isinstance(statement, capa.engine.Subscope): return { "type": "subscope", @@ -90,7 +83,8 @@ def convert_feature_to_result_document(feature): result = {"type": feature.name, feature.name: feature.get_value_str()} if feature.description: result["description"] = feature.description - + if feature.name == "regex": + result["match"] = feature.match return result diff --git a/capa/render/vverbose.py 
b/capa/render/vverbose.py index 09b05245..6ead988c 100644 --- a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -70,11 +70,6 @@ def render_statement(ostream, match, statement, indent=0): elif statement["type"] == "subscope": ostream.write(statement["subscope"]) ostream.writeln(":") - elif statement["type"] == "regex": - # regex is a `Statement` not a `Feature` - # this is because it doesn't get extracted, but applies to all strings in scope. - # so we have to handle it here - ostream.writeln("string: %s" % (statement["match"])) else: raise RuntimeError("unexpected match statement type: " + str(statement)) @@ -82,11 +77,17 @@ def render_statement(ostream, match, statement, indent=0): def render_feature(ostream, match, feature, indent=0): ostream.write(" " * indent) - ostream.write(feature["type"]) + key = feature["type"] + value = feature[feature["type"]] + if key == "regex": + key = "string" # render string for regex to mirror the rule source + value = feature["match"] # the match provides more information than the value for regex + + ostream.write(key) ostream.write(": ") - if feature[feature["type"]]: - ostream.write(rutils.bold2(feature[feature["type"]])) + if value: + ostream.write(rutils.bold2(value)) if "description" in feature: ostream.write(capa.rules.DESCRIPTION_SEPARATOR) diff --git a/capa/rules.py b/capa/rules.py index db66287a..e1b4623e 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -184,7 +184,7 @@ def parse_feature(key): if key == "api": return capa.features.insn.API elif key == "string": - return capa.features.String + return capa.features.StringFactory elif key == "bytes": return capa.features.Bytes elif key == "number": @@ -348,19 +348,13 @@ def build_statements(d, scope): raise InvalidRule("unexpected range: %s" % (count)) elif key == "string" and not isinstance(d[key], six.string_types): raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key]) - elif key == "string" and d[key].startswith("/") and 
(d[key].endswith("/") or d[key].endswith("/i")): - try: - return Regex(d[key]) - except re.error: - if d[key].endswith("/i"): - d[key] = d[key][: -len("i")] - raise InvalidRule( - "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % d[key] - ) else: Feature = parse_feature(key) value, description = parse_description(d[key], key, d.get("description")) - feature = Feature(value, description) + try: + feature = Feature(value, description) + except ValueError as e: + raise InvalidRule(str(e)) ensure_feature_valid_for_scope(scope, feature) return feature diff --git a/tests/test_rules.py b/tests/test_rules.py index c4e5d409..6e9bfa2a 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -74,12 +74,22 @@ def test_rule_yaml_descriptions(): - number: 1 = This is the number 1 - string: This program cannot be run in DOS mode. description: MS-DOS stub message + - string: '/SELECT.*FROM.*WHERE/i' + description: SQL WHERE Clause - count(number(2 = AF_INET/SOCK_DGRAM)): 2 """ ) r = capa.rules.Rule.from_yaml(rule) assert ( - r.evaluate({Number(1): {1}, Number(2): {2, 3}, String("This program cannot be run in DOS mode."): {4}}) == True + r.evaluate( + { + Number(1): {1}, + Number(2): {2, 3}, + String("This program cannot be run in DOS mode."): {4}, + String("SELECT password FROM hidden_table WHERE user == admin"): {5}, + } + ) + == True )