mirror of
https://github.com/mandiant/capa.git
synced 2025-12-12 15:49:46 -08:00
Merge pull request #593 from fireeye/feature-159
json: capture all strings matching regex
This commit is contained in:
@@ -112,6 +112,7 @@ It includes many new rules, including all new techniques introduced in MITRE ATT
|
||||
- meta: added `library_functions` field, `feature_counts.functions` does not include library functions any more #562 @mr-tz
|
||||
- linter: check for `or` with always true child statement, e.g. `optional`, colors #348 @mr-tz
|
||||
- json: breaking change in results document; now contains parsed MBC fields instead of canonical representation #526 @mr-tz
|
||||
- json: breaking change: record all matching strings for regex #159 @williballenthin
|
||||
|
||||
### Development
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
import re
|
||||
import codecs
|
||||
import logging
|
||||
import collections
|
||||
|
||||
import capa.engine
|
||||
import capa.features
|
||||
@@ -155,6 +156,10 @@ class Regex(String):
|
||||
)
|
||||
|
||||
def evaluate(self, ctx):
|
||||
# mapping from string value to list of locations.
|
||||
# will unique the locations later on.
|
||||
matches = collections.defaultdict(list)
|
||||
|
||||
for feature, locations in ctx.items():
|
||||
if not isinstance(feature, (capa.features.String,)):
|
||||
continue
|
||||
@@ -164,13 +169,26 @@ class Regex(String):
|
||||
# using this mode is more convenient for rule authors,
|
||||
# so that they don't have to prefix/suffix their terms like: /.*foo.*/.
|
||||
if self.re.search(feature.value):
|
||||
# unlike other features, we cannot put a reference to `self` directly in a `Result`.
|
||||
# this is because `self` may match on many strings, so we can't stuff the matched value into it.
|
||||
# instead, return a new instance that has a reference to both the regex and the matched value.
|
||||
# see #262.
|
||||
return capa.engine.Result(True, _MatchedRegex(self, feature.value), [], locations=locations)
|
||||
matches[feature.value].extend(locations)
|
||||
|
||||
return capa.engine.Result(False, _MatchedRegex(self, None), [])
|
||||
if matches:
|
||||
# finalize: defaultdict -> dict
|
||||
# which makes json serialization easier
|
||||
matches = dict(matches)
|
||||
|
||||
# collect all locations
|
||||
locations = set()
|
||||
for s in matches.keys():
|
||||
matches[s] = list(set(matches[s]))
|
||||
locations.update(matches[s])
|
||||
|
||||
# unlike other features, we cannot put a reference to `self` directly in a `Result`.
|
||||
# this is because `self` may match on many strings, so we can't stuff the matched value into it.
|
||||
# instead, return a new instance that has a reference to both the regex and the matched values.
|
||||
# see #262.
|
||||
return capa.engine.Result(True, _MatchedRegex(self, matches), [], locations=locations)
|
||||
else:
|
||||
return capa.engine.Result(False, _MatchedRegex(self, None), [])
|
||||
|
||||
def __str__(self):
|
||||
return "regex(string =~ %s)" % self.value
|
||||
@@ -178,27 +196,30 @@ class Regex(String):
|
||||
|
||||
class _MatchedRegex(Regex):
|
||||
"""
|
||||
this represents a specific instance of a regular expression feature match.
|
||||
treat it the same as a `Regex` except it has the `match` field that contains the complete string that matched.
|
||||
this represents specific match instances of a regular expression feature.
|
||||
treat it the same as a `Regex` except it has the `matches` field that contains the complete strings that matched.
|
||||
|
||||
note: this type should only ever be constructed by `Regex.evaluate()`. it is not part of the public API.
|
||||
"""
|
||||
|
||||
def __init__(self, regex, match):
|
||||
def __init__(self, regex, matches):
|
||||
"""
|
||||
args:
|
||||
regex (Regex): the regex feature that matches
|
||||
match (string|None): the matching string or None if it doesn't match
|
||||
regex (Regex): the regex feature that matches.
|
||||
matches (Dict[string, List[int]]|None): mapping from matching string to its locations, or None if the regex doesn't match.
|
||||
"""
|
||||
super(_MatchedRegex, self).__init__(regex.value, description=regex.description)
|
||||
# we want this to collide with the name of `Regex` above,
|
||||
# so that it works nicely with the renderers.
|
||||
self.name = "regex"
|
||||
# this may be None if the regex doesn't match
|
||||
self.match = match
|
||||
self.matches = matches
|
||||
|
||||
def __str__(self):
|
||||
return 'regex(string =~ %s, matched = "%s")' % (self.value, self.match)
|
||||
return "regex(string =~ %s, matches = %s)" % (
|
||||
self.value,
|
||||
", ".join(map(lambda s: '"' + s + '"', (self.matches or {}).keys())),
|
||||
)
|
||||
|
||||
|
||||
class StringFactory(object):
|
||||
|
||||
@@ -12,6 +12,8 @@ import idc
|
||||
import idaapi
|
||||
from PyQt5 import QtGui, QtCore
|
||||
|
||||
import capa.rules
|
||||
import capa.features
|
||||
import capa.ida.helpers
|
||||
import capa.render.utils as rutils
|
||||
from capa.ida.plugin.item import (
|
||||
@@ -555,9 +557,14 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
|
||||
)
|
||||
|
||||
if feature["type"] == "regex":
|
||||
return CapaExplorerStringViewItem(
|
||||
parent, display, location, '"%s"' % capa.features.escape_string(feature["match"])
|
||||
)
|
||||
for s, locations in feature["matches"].items():
|
||||
if location in locations:
|
||||
return CapaExplorerStringViewItem(
|
||||
parent, display, location, '"' + capa.features.escape_string(s) + '"'
|
||||
)
|
||||
|
||||
# programming error: the given location should always be found in the regex matches
|
||||
raise ValueError("regex match at location not found")
|
||||
|
||||
if feature["type"] == "basicblock":
|
||||
return CapaExplorerBlockItem(parent, location)
|
||||
|
||||
@@ -72,7 +72,7 @@ def convert_feature_to_result_document(feature):
|
||||
if feature.description:
|
||||
result["description"] = feature.description
|
||||
if feature.name == "regex":
|
||||
result["match"] = feature.match
|
||||
result["matches"] = feature.matches
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -6,11 +6,10 @@
|
||||
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and limitations under the License.
|
||||
|
||||
import collections
|
||||
|
||||
import tabulate
|
||||
|
||||
import capa.rules
|
||||
import capa.features
|
||||
import capa.render.utils as rutils
|
||||
import capa.render.verbose
|
||||
|
||||
@@ -85,30 +84,50 @@ def render_statement(ostream, match, statement, indent=0):
|
||||
raise RuntimeError("unexpected match statement type: " + str(statement))
|
||||
|
||||
|
||||
def render_string_value(s):
|
||||
return '"%s"' % capa.features.escape_string(s)
|
||||
|
||||
|
||||
def render_feature(ostream, match, feature, indent=0):
|
||||
ostream.write(" " * indent)
|
||||
|
||||
key = feature["type"]
|
||||
value = feature[feature["type"]]
|
||||
if key == "regex":
|
||||
key = "string" # render string for regex to mirror the rule source
|
||||
value = feature["match"] # the match provides more information than the value for regex
|
||||
|
||||
if key == "string":
|
||||
value = '"%s"' % capa.features.escape_string(value)
|
||||
if key != "regex":
|
||||
# like:
|
||||
# number: 10 = SOME_CONSTANT @ 0x401000
|
||||
if key == "string":
|
||||
value = render_string_value(value)
|
||||
|
||||
ostream.write(key)
|
||||
ostream.write(": ")
|
||||
ostream.write(key)
|
||||
ostream.write(": ")
|
||||
|
||||
if value:
|
||||
ostream.write(rutils.bold2(value))
|
||||
if value:
|
||||
ostream.write(rutils.bold2(value))
|
||||
|
||||
if "description" in feature:
|
||||
ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
|
||||
ostream.write(feature["description"])
|
||||
if "description" in feature:
|
||||
ostream.write(capa.rules.DESCRIPTION_SEPARATOR)
|
||||
ostream.write(feature["description"])
|
||||
|
||||
render_locations(ostream, match)
|
||||
ostream.write("\n")
|
||||
render_locations(ostream, match)
|
||||
ostream.write("\n")
|
||||
else:
|
||||
# like:
|
||||
# regex: /blah/ = SOME_CONSTANT
|
||||
# - "foo blah baz" @ 0x401000
|
||||
# - "aaa blah bbb" @ 0x402000, 0x403400
|
||||
ostream.write(key)
|
||||
ostream.write(": ")
|
||||
ostream.write(value)
|
||||
ostream.write("\n")
|
||||
|
||||
for match, locations in sorted(feature["matches"].items(), key=lambda p: p[0]):
|
||||
ostream.write(" " * (indent + 1))
|
||||
ostream.write("- ")
|
||||
ostream.write(rutils.bold2(render_string_value(match)))
|
||||
render_locations(ostream, {"locations": locations})
|
||||
ostream.write("\n")
|
||||
|
||||
|
||||
def render_node(ostream, match, node, indent=0):
|
||||
|
||||
Reference in New Issue
Block a user