diff --git a/README.md b/README.md index b8647373..318b6ba6 100644 --- a/README.md +++ b/README.md @@ -317,6 +317,25 @@ These are the features supported at the function-scope: - [mnemonic](#mnemonic) - [characteristics](#characteristics) +All of them support an optional description which helps with documenting rules and provides context in capa's output. +It can be specified in the following way: + +``` +- string: This program cannot be run in DOS mode. + description: MS-DOS stub message +- number: 0x4550 + description: IMAGE_NT_SIGNATURE (PE) +``` + +For all features except for [string](#string), the description can be specified inline preceded by ` = `. +For the previous [number](#number) example: + +``` +- number: 0x4550 = IMAGE_NT_SIGNATURE (PE) +``` + +The inline syntax is preferred (except for [string](#string)). + ### api A call to a named function, probably an import, though possibly a local function (like `malloc`) extracted via FLIRT. @@ -339,8 +358,8 @@ For example, a crypto constant. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. -To associate context with a number, e.g. for constant definitions, append an equal sign and the respective name to -the number definition. This helps with documenting rules and provides context in capa's output. +It can include an optional description, e.g. for constant definitions. +The inline syntax is preferred (` = DESCRIPTION STRING`). Examples: @@ -362,20 +381,29 @@ Regexes should be surrounded with `/` characters. By default, capa uses case-sensitive matching and assumes leading and trailing wildcards. To perform case-insensitive matching append an `i`. To anchor the regex at the start or end of a string, use `^` and/or `$`. +Strings can include a description, but the inline syntax is not supported. + Examples: - string: This program cannot be run in DOS mode. 
- string: Firefox 64.0 - string: /SELECT.*FROM.*WHERE/ - string: /Hardware\\Description\\System\\CentralProcessor/i - +``` +- string: This program cannot be run in DOS mode. + description: MS-DOS stub message +- string: '{3E5FC7F9-9A51-4367-9063-A120244FBEC7}' + description: CLSID_CMSTPLUA +- string: Firefox 64.0 +- string: /SELECT.*FROM.*WHERE/ +- string: /Hardware\\Description\\System\\CentralProcessor/i +``` + Note that regex matching is expensive (`O(features)` rather than `O(1)`) so they should be used sparingly. ### bytes A sequence of bytes referenced by the logic of the program. The provided sequence must match from the beginning of the referenced bytes and be no more than `0x100` bytes. -The parameter is a sequence of hexadecimal bytes followed by an optional description. - +The parameter is a sequence of hexadecimal bytes. +It can include an optional description. +The inline syntax is preferred (` = DESCRIPTION STRING`). + The example below illustrates byte matching given a COM CLSID pushed onto the stack prior to `CoCreateInstance`. @@ -397,6 +425,7 @@ A structure offset referenced by the logic of the program. This should not be a stack offset. The parameter is a number; if prefixed with `0x` then in hex format, otherwise, decimal format. +It can be followed by an optional description. Examples: @@ -453,6 +482,8 @@ These are the features supported at the file-scope: - [import](#import) - [section](#section) +All of them can be followed by an optional description, like the features in the previous section. + ### file string An ASCII or UTF-16 LE string present in the file. 
diff --git a/capa/features/__init__.py b/capa/features/__init__.py index 9fec2d76..182cd514 100644 --- a/capa/features/__init__.py +++ b/capa/features/__init__.py @@ -17,10 +17,11 @@ def bytes_to_str(b): class Feature(object): - def __init__(self, args): + def __init__(self, args, description=None): super(Feature, self).__init__() self.name = self.__class__.__name__ self.args = args + self.description = description def __hash__(self): return hash((self.name, tuple(self.args))) @@ -28,8 +29,17 @@ class Feature(object): def __eq__(self, other): return self.name == other.name and self.args == other.args + def _str_name(self): + return self.name.lower() + + def _str_value(self): + return ','.join(self.args) + def __str__(self): - return '%s(%s)' % (self.name.lower(), ','.join(self.args)) + if self.description: + return '%s(%s = %s)' % (self._str_name(), self._str_value(), self.description) + else: + return '%s(%s)' % (self._str_name(), self._str_value()) def __repr__(self): return str(self) @@ -50,22 +60,22 @@ class Feature(object): class MatchedRule(Feature): - def __init__(self, rule_name): - super(MatchedRule, self).__init__([rule_name]) + def __init__(self, rule_name, description=None): + super(MatchedRule, self).__init__([rule_name], description) self.rule_name = rule_name - def __str__(self): - return 'match(%s)' % (self.rule_name) + def _str_name(self): + return 'match' class Characteristic(Feature): - def __init__(self, name, value=None): + def __init__(self, name, value=None, description=None): ''' when `value` is not provided, this serves as descriptor for a class of characteristics. this is only used internally, such as in `rules.py` when checking if a statement is supported by a given scope. 
''' - super(Characteristic, self).__init__([name, value]) + super(Characteristic, self).__init__([name, value], description) self.name = name self.value = value @@ -74,27 +84,23 @@ class Characteristic(Feature): raise ValueError('cannot evaluate characteristc %s with empty value' % (str(self))) return super(Characteristic, self).evaluate(ctx) - def __str__(self): + def _str_value(self): if self.value is None: - return 'characteristic(%s)' % (self.name) + return self.name else: - return 'characteristic(%s(%s))' % (self.name, self.value) + return '%s(%s)' % (self.name, self.value) class String(Feature): - def __init__(self, value): - super(String, self).__init__([value]) + def __init__(self, value, description=None): + super(String, self).__init__([value], description) self.value = value - def __str__(self): - return 'string("%s")' % (self.value) - class Bytes(Feature): - def __init__(self, value, symbol=None): - super(Bytes, self).__init__([value]) + def __init__(self, value, description=None): + super(Bytes, self).__init__([value], description) self.value = value - self.symbol = symbol def evaluate(self, ctx): for feature, locations in ctx.items(): @@ -106,11 +112,8 @@ class Bytes(Feature): return capa.engine.Result(False, self, []) - def __str__(self): - if self.symbol: - return 'bytes(0x%s = %s)' % (bytes_to_str(self.value).upper(), self.symbol) - else: - return 'bytes(0x%s)' % (bytes_to_str(self.value).upper()) + def _str_value(self): + return '0x%s' % bytes_to_str(self.value).upper() def freeze_serialize(self): return (self.__class__.__name__, diff --git a/capa/features/file.py b/capa/features/file.py index 708b8e2b..396edd1f 100644 --- a/capa/features/file.py +++ b/capa/features/file.py @@ -2,30 +2,21 @@ from capa.features import Feature class Export(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is export name - super(Export, self).__init__([value]) + super(Export, self).__init__([value], description) self.value 
= value - def __str__(self): - return 'Export(%s)' % (self.value) - class Import(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is import name - super(Import, self).__init__([value]) + super(Import, self).__init__([value], description) self.value = value - def __str__(self): - return 'Import(%s)' % (self.value) - class Section(Feature): - def __init__(self, value): + def __init__(self, value, description=None): # value is section name - super(Section, self).__init__([value]) + super(Section, self).__init__([value], description) self.value = value - - def __str__(self): - return 'Section(%s)' % (self.value) diff --git a/capa/features/insn.py b/capa/features/insn.py index b8ebf9da..122bd0ae 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -2,45 +2,34 @@ from capa.features import Feature class API(Feature): - def __init__(self, name): + def __init__(self, name, description=None): # Downcase library name if given if '.' in name: modname, impname = name.split('.') name = modname.lower() + '.' 
+ impname - super(API, self).__init__([name]) + super(API, self).__init__([name], description) class Number(Feature): - def __init__(self, value, symbol=None): - super(Number, self).__init__([value]) + def __init__(self, value, description=None): + super(Number, self).__init__([value], description) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'number(0x%x = %s)' % (self.value, self.symbol) - else: - return 'number(0x%x)' % (self.value) + def _str_value(self): + return '0x%x' % self.value class Offset(Feature): - def __init__(self, value, symbol=None): - super(Offset, self).__init__([value]) + def __init__(self, value, description=None): + super(Offset, self).__init__([value], description) self.value = value - self.symbol = symbol - def __str__(self): - if self.symbol: - return 'offset(0x%x = %s)' % (self.value, self.symbol) - else: - return 'offset(0x%x)' % (self.value) + def _str_value(self): + return '0x%x' % self.value class Mnemonic(Feature): - def __init__(self, value): - super(Mnemonic, self).__init__([value]) + def __init__(self, value, description=None): + super(Mnemonic, self).__init__([value], description) self.value = value - - def __str__(self): - return 'mnemonic(%s)' % (self.value) diff --git a/capa/rules.py b/capa/rules.py index 8e42d0fc..7e2ffd6a 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -207,7 +207,7 @@ def parse_feature(key): return capa.features.basicblock.BasicBlock elif key.startswith('characteristic(') and key.endswith(')'): characteristic = key[len('characteristic('):-len(')')] - return lambda v: capa.features.Characteristic(characteristic, v) + return lambda v, description=None: capa.features.Characteristic(characteristic, v, description) elif key == 'export': return capa.features.file.Export elif key == 'import': @@ -220,18 +220,18 @@ def parse_feature(key): raise InvalidRule('unexpected statement: %s' % key) -def parse_symbol(s, value_type): +def parse_description(s, value_type, description=None): ''' s can be an int or a string ''' - if 
isinstance(s, str) and '=' in s: - value, symbol = s.split('=', 1) - symbol = symbol.strip() - if symbol == '': - raise InvalidRule('unexpected value: "%s", symbol name cannot be empty' % s) + if value_type != 'string' and isinstance(s, str) and ' = ' in s: + if description: + raise InvalidRule('unexpected value: "%s", only one description allowed (inline description with `=`)' % s) + value, description = s.split(' = ', 1) + if description == '': + raise InvalidRule('unexpected value: "%s", description cannot be empty' % s) else: value = s - symbol = None if isinstance(value, str): if value_type == 'bytes': @@ -244,17 +244,17 @@ def parse_symbol(s, value_type): if len(value) > MAX_BYTES_FEATURE_SIZE: raise InvalidRule('unexpected bytes value: byte sequences must be no larger than %s bytes' % MAX_BYTES_FEATURE_SIZE) - else: + elif value_type in ['number', 'offset']: try: value = parse_int(value) except ValueError: raise InvalidRule('unexpected value: "%s", must begin with numerical value' % value) - return value, symbol + return value, description def build_statements(d, scope): - if len(d.keys()) != 1: + if len(d.keys()) > 2: raise InvalidRule('too many statements') key = list(d.keys())[0] @@ -330,10 +330,10 @@ def build_statements(d, scope): # # count(offset(0xC)) # count(number(0x11223344)) - # count(number(0x100 = symbol name)) + # count(number(0x100 = description)) if term in ('number', 'offset', 'bytes'): - value, symbol = parse_symbol(arg, term) - feature = Feature(value, symbol) + value, description = parse_description(arg, term) + feature = Feature(value, description) else: # arg is string, like: # @@ -370,13 +370,8 @@ def build_statements(d, scope): raise InvalidRule('invalid regular expression: %s it should use Python syntax, try it at https://pythex.org' % d[key]) else: Feature = parse_feature(key) - if key in ('number', 'offset', 'bytes'): - # parse numbers with symbol description, e.g. 0x4550 = IMAGE_DOS_SIGNATURE - # or regular numbers, e.g. 
37 - value, symbol = parse_symbol(d[key], key) - feature = Feature(value, symbol) - else: - feature = Feature(d[key]) + value, symbol = parse_description(d[key], key, d.get('description')) + feature = Feature(value, symbol) ensure_feature_valid_for_scope(scope, feature) return feature