From 6fe56f62243f26478605f90a7478318691464c73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Wed, 15 Jul 2020 18:44:22 +0200 Subject: [PATCH 1/4] Make Regex a Feature This enables description for regular expressions and simplifies the code. --- capa/engine.py | 34 ---------------------------- capa/features/__init__.py | 45 ++++++++++++++++++++++++++++++++++++++ capa/ida/explorer/model.py | 14 ++++++------ capa/render/__init__.py | 10 ++------- capa/render/vverbose.py | 17 +++++++------- capa/rules.py | 16 +++++--------- 6 files changed, 68 insertions(+), 68 deletions(-) diff --git a/capa/engine.py b/capa/engine.py index 34bb5cc7..8e5f0a3c 100644 --- a/capa/engine.py +++ b/capa/engine.py @@ -1,6 +1,5 @@ # Copyright (C) 2020 FireEye, Inc. All Rights Reserved. -import re import sys import copy import collections @@ -176,39 +175,6 @@ class Range(Statement): return "range(%s, min=%d, max=%d)" % (str(self.child), self.min, self.max) -class Regex(Statement): - """match if the given pattern matches a String feature.""" - - def __init__(self, pattern): - super(Regex, self).__init__() - self.pattern = pattern - pat = self.pattern[len("/") : -len("/")] - flags = re.DOTALL - if pattern.endswith("/i"): - pat = self.pattern[len("/") : -len("/i")] - flags |= re.IGNORECASE - self.re = re.compile(pat, flags) - self.match = "" - - def evaluate(self, ctx): - for feature, locations in ctx.items(): - if not isinstance(feature, (capa.features.String,)): - continue - - # `re.search` finds a match anywhere in the given string - # which implies leading and/or trailing whitespace. - # using this mode cleans is more convenient for rule authors, - # so that they don't have to prefix/suffix their terms like: /.*foo.*/. 
- if self.re.search(feature.value): - self.match = feature.value - return Result(True, self, [], locations=locations) - - return Result(False, self, []) - - def __str__(self): - return 'regex(string =~ %s, matched = "%s")' % (self.pattern, self.match) - - class Subscope(Statement): """ a subscope element is a placeholder in a rule - it should not be evaluated directly. diff --git a/capa/features/__init__.py b/capa/features/__init__.py index e72daffd..9d18b8bd 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -1,5 +1,6 @@ # Copyright (C) 2020 FireEye, Inc. All Rights Reserved. +import re import sys import codecs import logging @@ -82,6 +83,50 @@ class String(Feature): super(String, self).__init__(value, description) +class Regex(String): + def __init__(self, value, description=None): + super(Regex, self).__init__(value, description) + pat = self.value[len("/") : -len("/")] + flags = re.DOTALL + if value.endswith("/i"): + pat = self.value[len("/") : -len("/i")] + flags |= re.IGNORECASE + try: + self.re = re.compile(pat, flags) + except re.error: + if value.endswith("/i"): + value = value[: -len("i")] + raise ValueError( + "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value + ) + self.match = "" + + def evaluate(self, ctx): + for feature, locations in ctx.items(): + if not isinstance(feature, (capa.features.String,)): + continue + + # `re.search` finds a match anywhere in the given string + # which implies leading and/or trailing wildcards. + # using this mode is cleaner and more convenient for rule authors, + # so that they don't have to prefix/suffix their terms like: /.*foo.*/.
+ if self.re.search(feature.value): + self.match = feature.value + return capa.engine.Result(True, self, [], locations=locations) + + return capa.engine.Result(False, self, []) + + def __str__(self): + return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match) + + +class StringFactory(object): + def __new__(self, value, description): + if value.startswith("/") and (value.endswith("/") or value.endswith("/i")): + return Regex(value, description) + return String(value, description) + + class Bytes(Feature): def __init__(self, value, description=None): super(Bytes, self).__init__(value, description) diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py index 93949d16..27459b09 100644 --- a/capa/ida/explorer/model.py +++ b/capa/ida/explorer/model.py @@ -373,11 +373,6 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): return parent2 elif statement["type"] == "subscope": return CapaExplorerSubscopeItem(parent, statement[statement["type"]]) - elif statement["type"] == "regex": - # regex is a `Statement` not a `Feature` - # this is because it doesn't get extracted, but applies to all strings in scope. 
- # so we have to handle it here - return CapaExplorerFeatureItem(parent, "regex(%s)" % statement["pattern"], details=statement["match"]) else: raise RuntimeError("unexpected match statement type: " + str(statement)) @@ -493,13 +488,18 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): } """ display = self.capa_doc_feature_to_display(feature) + details = "" + if feature["type"] == "regex": + details = feature["match"] if len(locations) == 1: # only one location for feature so no need to nest children - parent2 = self.render_capa_doc_feature(parent, feature, next(iter(locations)), doc, display=display) + parent2 = self.render_capa_doc_feature( + parent, feature, next(iter(locations)), doc, display=display, details=details + ) else: # feature has multiple children, nest under one parent feature node - parent2 = CapaExplorerFeatureItem(parent, display) + parent2 = CapaExplorerFeatureItem(parent, display, details=details) for location in sorted(locations): self.render_capa_doc_feature(parent2, feature, location, doc) diff --git a/capa/render/__init__.py b/capa/render/__init__.py index dc44a244..0bc9e437 100644 --- a/capa/render/__init__.py +++ b/capa/render/__init__.py @@ -46,13 +46,6 @@ def convert_statement_to_result_document(statement): "max": statement.max, "child": convert_feature_to_result_document(statement.child), } - elif isinstance(statement, capa.engine.Regex): - return { - "type": "regex", - "pattern": statement.pattern, - # the string that was matched - "match": statement.match, - } elif isinstance(statement, capa.engine.Subscope): return { "type": "subscope", @@ -90,7 +83,8 @@ def convert_feature_to_result_document(feature): result = {"type": feature.name, feature.name: feature.get_value_str()} if feature.description: result["description"] = feature.description - + if feature.name == "regex": + result["match"] = feature.match return result diff --git a/capa/render/vverbose.py b/capa/render/vverbose.py index 09b05245..6ead988c 100644 --- 
a/capa/render/vverbose.py +++ b/capa/render/vverbose.py @@ -70,11 +70,6 @@ def render_statement(ostream, match, statement, indent=0): elif statement["type"] == "subscope": ostream.write(statement["subscope"]) ostream.writeln(":") - elif statement["type"] == "regex": - # regex is a `Statement` not a `Feature` - # this is because it doesn't get extracted, but applies to all strings in scope. - # so we have to handle it here - ostream.writeln("string: %s" % (statement["match"])) else: raise RuntimeError("unexpected match statement type: " + str(statement)) @@ -82,11 +77,17 @@ def render_statement(ostream, match, statement, indent=0): def render_feature(ostream, match, feature, indent=0): ostream.write(" " * indent) - ostream.write(feature["type"]) + key = feature["type"] + value = feature[feature["type"]] + if key == "regex": + key = "string" # render string for regex to mirror the rule source + value = feature["match"] # the match provides more information than the value for regex + + ostream.write(key) ostream.write(": ") - if feature[feature["type"]]: - ostream.write(rutils.bold2(feature[feature["type"]])) + if value: + ostream.write(rutils.bold2(value)) if "description" in feature: ostream.write(capa.rules.DESCRIPTION_SEPARATOR) diff --git a/capa/rules.py b/capa/rules.py index db66287a..e1b4623e 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -184,7 +184,7 @@ def parse_feature(key): if key == "api": return capa.features.insn.API elif key == "string": - return capa.features.String + return capa.features.StringFactory elif key == "bytes": return capa.features.Bytes elif key == "number": @@ -348,19 +348,13 @@ def build_statements(d, scope): raise InvalidRule("unexpected range: %s" % (count)) elif key == "string" and not isinstance(d[key], six.string_types): raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key]) - elif key == "string" and d[key].startswith("/") and (d[key].endswith("/") or d[key].endswith("/i")): - try: - return 
Regex(d[key]) - except re.error: - if d[key].endswith("/i"): - d[key] = d[key][: -len("i")] - raise InvalidRule( - "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % d[key] - ) else: Feature = parse_feature(key) value, description = parse_description(d[key], key, d.get("description")) - feature = Feature(value, description) + try: + feature = Feature(value, description) + except ValueError as e: + raise InvalidRule(str(e)) ensure_feature_valid_for_scope(scope, feature) return feature From 78dae308c2474ce91bffcc14573a6114ff55ce01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Wed, 15 Jul 2020 19:07:14 +0200 Subject: [PATCH 2/4] Add test for RegExp descriptions Now that RegExps are features, ensure that descriptions are working. --- tests/test_rules.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/tests/test_rules.py b/tests/test_rules.py index c4e5d409..6e9bfa2a 100644 --- a/tests/test_rules.py +++ b/tests/test_rules.py @@ -74,12 +74,22 @@ def test_rule_yaml_descriptions(): - number: 1 = This is the number 1 - string: This program cannot be run in DOS mode.
description: MS-DOS stub message + - string: '/SELECT.*FROM.*WHERE/i' + description: SQL WHERE Clause - count(number(2 = AF_INET/SOCK_DGRAM)): 2 """ ) r = capa.rules.Rule.from_yaml(rule) assert ( - r.evaluate({Number(1): {1}, Number(2): {2, 3}, String("This program cannot be run in DOS mode."): {4}}) == True + r.evaluate( + { + Number(1): {1}, + Number(2): {2, 3}, + String("This program cannot be run in DOS mode."): {4}, + String("SELECT password FROM hidden_table WHERE user == admin"): {5}, + } + ) + == True ) From 67cfb3866c8d7790be77cb3495243595dce011d1 Mon Sep 17 00:00:00 2001 From: Michael Hunhoff Date: Wed, 15 Jul 2020 14:55:29 -0600 Subject: [PATCH 3/4] support regex feature capa explorer --- capa/ida/explorer/model.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/capa/ida/explorer/model.py b/capa/ida/explorer/model.py index 27459b09..26e387c5 100644 --- a/capa/ida/explorer/model.py +++ b/capa/ida/explorer/model.py @@ -488,18 +488,15 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): } """ display = self.capa_doc_feature_to_display(feature) - details = "" - if feature["type"] == "regex": - details = feature["match"] if len(locations) == 1: # only one location for feature so no need to nest children parent2 = self.render_capa_doc_feature( - parent, feature, next(iter(locations)), doc, display=display, details=details + parent, feature, next(iter(locations)), doc, display=display, ) else: # feature has multiple children, nest under one parent feature node - parent2 = CapaExplorerFeatureItem(parent, display, details=details) + parent2 = CapaExplorerFeatureItem(parent, display) for location in sorted(locations): self.render_capa_doc_feature(parent2, feature, location, doc) @@ -539,6 +536,9 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel): parent, display, source=doc["rules"].get(feature[feature["type"]], {}).get("source", "") ) + if feature["type"] == "regex": + return CapaExplorerFeatureItem(parent, display, 
location, details=feature["match"]) + if feature["type"] == "basicblock": return CapaExplorerBlockItem(parent, location) From 80ba19a4667ffae5bdf2ec00afbf016cdcb431f4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ana=20Mar=C3=ADa=20Mart=C3=ADnez=20G=C3=B3mez?= Date: Wed, 15 Jul 2020 23:02:06 +0200 Subject: [PATCH 4/4] Do not initialize Regex match It is not used until it has a value. --- capa/features/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 9d18b8bd..40773a51 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -99,7 +99,7 @@ class Regex(String): raise ValueError( "invalid regular expression: %s it should use Python syntax, try it at https://pythex.org" % value ) - self.match = "" + self.match = None def evaluate(self, ctx): for feature, locations in ctx.items():