capa: extractors: sketch API extension to support function id

This commit is contained in:
William Ballenthin
2021-02-25 12:20:29 -07:00
parent fa7d58d01a
commit 1b2c8880ee
2 changed files with 48 additions and 4 deletions

View File

@@ -76,6 +76,41 @@ class FeatureExtractor(object):
"""
raise NotImplemented
def is_library_function(self, va):
    """
    report whether the function starting at the given address is library code.

    this base implementation performs no function identification and
    answers False for every address; a backend with its own matching
    algorithm can override it.

    the argument is a plain VA, not a function object, so that callers can
    also ask about addresses they find while inspecting instructions.

    the answer is used to:
      - skip matching within library functions (by default), and
      - decide when to look up symbol names for called (non-API) functions

    args:
      va [int]: the virtual address of a function.

    returns:
      bool: True if the given address is the start of a library function.
    """
    # default: no function identification available, so nothing is library code.
    return False
def get_function_name(self, va):
    """
    look up the recognized name, if any, of the function at the given address.

    a name is only guaranteed to be available when the address is a
    recognized library function. this base implementation recognizes no
    names, so it always raises.

    the argument is a plain VA, not a function object, so that callers can
    also ask about addresses they find while inspecting instructions.

    args:
      va [int]: the virtual address of a function.

    returns:
      str: the function name

    raises:
      KeyError: when the given function does not have a name.
    """
    # default: no names are known; the requested address is carried in the error.
    missing = KeyError(va)
    raise missing
@abc.abstractmethod
def extract_function_features(self, f):
"""

View File

@@ -123,10 +123,19 @@ def find_capabilities(ruleset, extractor, disable_progress=None):
# to disable progress completely
pbar = lambda s, *args, **kwargs: s
for f in pbar(list(extractor.get_functions()), desc="matching", unit=" functions"):
functions = list(extractor.get_functions())
for f in pbar(functions, desc="matching", unit=" functions"):
function_address = f.__int__()
if extractor.is_library_function(function_address):
function_name = extractor.get_function_name(function_address)
logger.debug("skipping library function 0x%x (%s)", function_address, function_name)
continue
function_matches, bb_matches, feature_count = find_function_capabilities(ruleset, extractor, f)
meta["feature_counts"]["functions"][f.__int__()] = feature_count
logger.debug("analyzed function 0x%x and extracted %d features", f.__int__(), feature_count)
meta["feature_counts"]["functions"][function_address] = feature_count
logger.debug("analyzed function 0x%x and extracted %d features", function_address, feature_count)
for rule_name, res in function_matches.items():
all_function_matches[rule_name].extend(res)
@@ -134,7 +143,7 @@ def find_capabilities(ruleset, extractor, disable_progress=None):
all_bb_matches[rule_name].extend(res)
# mapping from matched rule feature to set of addresses at which it matched.
# schema: Dic[MatchedRule: Set[int]
# schema: Dict[MatchedRule: Set[int]]
function_features = {
capa.features.MatchedRule(rule_name): set(map(lambda p: p[0], results))
for rule_name, results in all_function_matches.items()