Merge branch 'dotnet-main' of github.com:mandiant/capa into feature-981

This commit is contained in:
Willi Ballenthin
2022-04-08 12:17:16 -06:00
10 changed files with 163 additions and 20 deletions

View File

@@ -12,6 +12,7 @@
### Breaking Changes
- instruction scope and operand features are new and are not backwards compatible with older versions of capa
- Python 3.7 is now the minimum supported Python version #866 @williballenthin
- remove /x32 and /x64 flavors of number and operand features #932 @williballenthin
- the tool now accepts multiple paths to rules, and JSON doc updated accordingly @williballenthin

View File

@@ -22,6 +22,7 @@ class Address(abc.ABC):
class AbsoluteVirtualAddress(int, Address):
"""an absolute memory address"""
def __new__(cls, v):
    """create the address from the non-negative integer `v`.

    args:
      v: the integer value of the address; must not be negative.

    raises:
      AssertionError: when `v` is negative.
    """
    # zero is a representable address (e.g. code mapped at base 0),
    # so only negative values are rejected.
    assert v >= 0
    return int.__new__(cls, v)
@@ -35,6 +36,7 @@ class RelativeVirtualAddress(int, Address):
class FileOffsetAddress(int, Address):
"""an address relative to the start of a file"""
def __new__(cls, v):
    """create the file offset from the non-negative integer `v`.

    args:
      v: the integer offset from the start of the file; must not be negative.

    raises:
      AssertionError: when `v` is negative.
    """
    # offset zero is the very first byte of the file and therefore valid,
    # so only negative values are rejected.
    assert v >= 0
    return int.__new__(cls, v)
@@ -42,6 +44,7 @@ class FileOffsetAddress(int, Address):
class DNTokenAddress(Token, Address):
    """an address expressed as a .NET token.

    adds no behaviour of its own; it only combines `Token` and `Address`
    through inheritance.
    """

View File

@@ -382,3 +382,8 @@ def get_function_blocks(f):
def is_basic_block_return(bb):
    """report whether the basic block `bb` is a return block.

    a block counts as a return block when its `type` equals `idaapi.fcb_ret`.
    """
    block_kind = bb.type
    return block_kind == idaapi.fcb_ret
def has_sib(oper) -> bool:
    """report whether the operand `oper` is flagged as carrying a SIB byte.

    the decision is made solely from `oper.specflag1`: the operand is treated
    as having a SIB byte exactly when that flag equals 1.
    """
    # via: https://reverseengineering.stackexchange.com/a/14300
    sib_flag = oper.specflag1
    return sib_flag == 1

View File

@@ -135,6 +135,15 @@ def extract_insn_number_features(f, bb, insn):
yield Number(const), insn.ea
yield OperandNumber(i, const), insn.ea
if insn.itype == idaapi.NN_add and 0 < const < MAX_STRUCTURE_SIZE and op.type == idaapi.o_imm:
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(const), insn.ea
yield OperandOffset(i, const), insn.ea
def extract_insn_bytes_features(f, bb, insn):
"""parse referenced byte sequences
@@ -209,6 +218,25 @@ def extract_insn_offset_features(f, bb, insn):
yield Offset(op_off), insn.ea
yield OperandOffset(i, op_off), insn.ea
if (
insn.itype == idaapi.NN_lea
and i == 1
# o_displ is used for both:
# [eax+1]
# [eax+ebx+2]
and op.type == idaapi.o_displ
# but the SIB is only present for [eax+ebx+2]
# which we don't want
and not capa.features.extractors.ida.helpers.has_sib(op)
):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx holds zero).
yield Number(op_off), insn.ea
yield OperandNumber(i, op_off), insn.ea
def contains_stack_cookie_keywords(s):
"""check if string contains stack cookie keywords

View File

@@ -5,7 +5,7 @@ import struct
from smda.common.SmdaReport import SmdaReport
import capa.features.extractors.helpers
from capa.features.insn import API, Number, Offset, Mnemonic
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Characteristic
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
@@ -64,15 +64,25 @@ def extract_insn_number_features(f, bb, insn):
# .text:00401140 call sub_407E2B
# .text:00401145 add esp, 0Ch
return
for operand in operands:
for i, operand in enumerate(operands):
try:
# The result of bitwise operations is calculated as though carried out
# in twos complement with an infinite number of sign bits
value = int(operand, 16) & ((1 << f.smda_report.bitness) - 1)
yield Number(value), insn.offset
except:
except ValueError:
continue
else:
yield Number(value), insn.offset
yield OperandNumber(i, value), insn.offset
if insn.mnemonic == "add" and 0 < value < MAX_STRUCTURE_SIZE:
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(value), insn.offset
yield OperandOffset(i, value), insn.offset
def read_bytes(smda_report, va, num_bytes=None):
@@ -198,11 +208,10 @@ def extract_insn_offset_features(f, bb, insn):
# mov eax, [esi + 4]
# mov eax, [esi + ecx + 16384]
operands = [o.strip() for o in insn.operands.split(",")]
for operand in operands:
if "ptr" not in operand:
continue
for i, operand in enumerate(operands):
if "esp" in operand or "ebp" in operand or "rbp" in operand:
continue
number = 0
number_hex = re.search(PATTERN_HEXNUM, operand)
number_int = re.search(PATTERN_SINGLENUM, operand)
@@ -212,7 +221,26 @@ def extract_insn_offset_features(f, bb, insn):
elif number_int:
number = int(number_int.group("num"))
number = -1 * number if number_int.group().startswith("-") else number
if "ptr" not in operand:
if (
insn.mnemonic == "lea"
and i == 1
and (operand.count("+") + operand.count("-")) == 1
and operand.count("*") == 0
):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx holds zero).
yield Number(number), insn.offset
yield OperandNumber(i, number), insn.offset
continue
yield Offset(number), insn.offset
yield OperandOffset(i, number), insn.offset
def is_security_cookie(f, bb, insn):

View File

@@ -19,7 +19,7 @@ import envi.archs.amd64.disasm
import capa.features.extractors.helpers
import capa.features.extractors.viv.helpers
from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Feature, Characteristic
from capa.features.address import Address, AbsoluteVirtualAddress
from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle
@@ -579,6 +579,15 @@ def extract_op_number_features(
yield Number(v), ihandle.address
yield OperandNumber(i, v), ihandle.address
if insn.mnem == "add" and 0 < v < MAX_STRUCTURE_SIZE and isinstance(oper, envi.archs.i386.disasm.i386ImmOper):
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(v), insn.va
yield OperandOffset(i, v), insn.va
def extract_op_offset_features(f, bb, ihandle: InsnHandle, i, oper: envi.Operand) -> Iterator[Tuple[Feature, Address]]:
"""parse structure offset features from the given operand."""
@@ -608,6 +617,15 @@ def extract_op_offset_features(f, bb, ihandle: InsnHandle, i, oper: envi.Operand
yield Offset(v), ihandle.address
yield OperandOffset(i, v), ihandle.address
if insn.mnem == "lea" and i == 1 and not f.vw.probeMemory(v, 1, envi.memory.MM_READ):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx holds zero).
yield Number(v), insn.va
yield OperandNumber(i, v), insn.va
# like: [esi + ecx + 16384]
# reg ^ ^
# index ^

View File

@@ -29,6 +29,10 @@ class Number(Feature):
return capa.render.utils.hex(self.value)
# max recognized structure size (and therefore, offset size).
# extractors only treat an `add` immediate as a possible offset
# when it is strictly between 0 and this value.
MAX_STRUCTURE_SIZE = 0x10000
class Offset(Feature):
def __init__(self, value: int, description=None):
super(Offset, self).__init__(value, description=description)

View File

@@ -37,7 +37,8 @@ from capa.features.common import (
)
CD = os.path.dirname(__file__)
DNFILE_TESTFILES = "dnfile-testfiles"
DOTNET_DIR = os.path.join(CD, "data", "dotnet")
DNFILE_TESTFILES = os.path.join(DOTNET_DIR, "dnfile-testfiles")
@contextlib.contextmanager
@@ -181,6 +182,14 @@ def extract_basic_block_features(extractor, f, bb):
return features
# f may not be hashable (e.g. ida func_t) so cannot @lru_cache this
def extract_instruction_features(extractor, f, bb, insn):
    """collect the features of a single instruction, grouped by feature.

    args:
      extractor: object providing `extract_insn_features(f, bb, insn)`,
        which yields `(feature, address)` pairs.
      f: the function handle passed through to the extractor.
      bb: the basic block handle passed through to the extractor.
      insn: the instruction handle passed through to the extractor.

    returns:
      a `collections.defaultdict(set)` mapping each feature to the set of
      addresses at which it was reported.
    """
    locations_by_feature = collections.defaultdict(set)
    pairs = extractor.extract_insn_features(f, bb, insn)
    for pair in pairs:
        found_feature, location = pair
        locations_by_feature[found_feature].add(location)
    return locations_by_feature
# note: to reduce the testing time, it's recommended to reuse already existing test samples, if possible
def get_data_path_by_name(name):
if name == "mimikatz":
@@ -234,7 +243,7 @@ def get_data_path_by_name(name):
elif name.startswith("b9f5b"):
return os.path.join(CD, "data", "b9f5bd514485fb06da39beff051b9fdc.exe_")
elif name.startswith("mixed-mode-64"):
return os.path.join(CD, "data", DNFILE_TESTFILES, "mixed-mode", "ModuleCode", "bin", "ModuleCode_amd64.exe")
return os.path.join(DNFILE_TESTFILES, "mixed-mode", "ModuleCode", "bin", "ModuleCode_amd64.exe")
else:
raise ValueError("unexpected sample fixture: %s" % name)
@@ -317,6 +326,13 @@ def get_basic_block(extractor, f, va):
raise ValueError("basic block not found")
def get_instruction(extractor, f, bb, va):
    """find the instruction at address `va` within basic block `bb` of function `f`.

    walks the instructions produced by `extractor.get_instructions(f, bb)` and
    returns the first one whose integer value equals `va`.

    raises:
      ValueError: when no instruction in the basic block matches `va`.
    """
    # unique marker so that any value an extractor yields can be told apart from "no match".
    not_found = object()
    candidates = extractor.get_instructions(f, bb)
    found = next((candidate for candidate in candidates if int(candidate) == va), not_found)
    if found is not_found:
        raise ValueError("instruction not found")
    return found
def resolve_scope(scope):
if scope == "file":
@@ -328,8 +344,32 @@ def resolve_scope(scope):
inner_file.__name__ = scope
return inner_file
elif "insn=" in scope:
# like `function=0x401000,bb=0x40100A,insn=0x40100A`
assert "function=" in scope
assert "bb=" in scope
assert "insn=" in scope
fspec, _, spec = scope.partition(",")
bbspec, _, ispec = spec.partition(",")
fva = int(fspec.partition("=")[2], 0x10)
bbva = int(bbspec.partition("=")[2], 0x10)
iva = int(ispec.partition("=")[2], 0x10)
def inner_insn(extractor):
f = get_function(extractor, fva)
bb = get_basic_block(extractor, f, bbva)
insn = get_instruction(extractor, f, bb, iva)
features = extract_instruction_features(extractor, f, bb, insn)
for k, vs in extract_global_features(extractor).items():
features[k].update(vs)
return features
inner_insn.__name__ = scope
return inner_insn
elif "bb=" in scope:
# like `function=0x401000,bb=0x40100A`
assert "function=" in scope
assert "bb=" in scope
fspec, _, bbspec = scope.partition(",")
fva = int(fspec.partition("=")[2], 0x10)
bbva = int(bbspec.partition("=")[2], 0x10)
@@ -459,6 +499,30 @@ FEATURE_PRESENCE_TESTS = sorted(
# insn/offset: negative
("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True),
("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True),
#
# insn/offset from mnemonic: add
#
# should not be considered, too big for an offset:
# .text:00401D85 81 C1 00 00 00 80 add ecx, 80000000h
("mimikatz", "function=0x401D64,bb=0x401D73,insn=0x401D85", capa.features.insn.Offset(0x80000000), False),
# should not be considered, relative to stack:
# .text:00401CF6 83 C4 10 add esp, 10h
("mimikatz", "function=0x401CC7,bb=0x401CDE,insn=0x401CF6", capa.features.insn.Offset(0x10), False),
# yes, this is also an offset (imagine eax is a pointer):
# .text:0040223C 83 C0 04 add eax, 4
("mimikatz", "function=0x402203,bb=0x402221,insn=0x40223C", capa.features.insn.Offset(0x4), True),
#
# insn/number from mnemonic: lea
#
# should not be considered, lea operand invalid encoding
# .text:00471EE6 8D 1C 81 lea ebx, [ecx+eax*4]
("mimikatz", "function=0x471EAB,bb=0x471ED8,insn=0x471EE6", capa.features.insn.Number(0x4), False),
# should not be considered, lea operand invalid encoding
# .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h]
("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False),
# yes, this is also a number (imagine ebx is zero):
# .text:004018C0 8D 4B 02 lea ecx, [ebx+2]
("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True),
# insn/api
("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True),
("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContext"), True),

View File

@@ -22,14 +22,6 @@ def test_smda_features(sample, scope, feature, expected):
if scope.__name__ == "file" and isinstance(feature, capa.features.file.FunctionName) and expected is True:
pytest.xfail("SMDA has no function ID")
if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandNumber) and expected is True:
# SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937
pytest.xfail("SMDA doesn't support operand numbers")
if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandOffset) and expected is True:
# SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937
pytest.xfail("SMDA doesn't support operand offsets")
fixtures.do_test_feature_presence(fixtures.get_smda_extractor, sample, scope, feature, expected)