Merge pull request #977 from mandiant/feature-320

extract extra offset/number features
This commit is contained in:
Willi Ballenthin
2022-04-07 14:20:10 -06:00
committed by GitHub
8 changed files with 157 additions and 18 deletions

View File

@@ -7,6 +7,7 @@
- add new scope "instruction" for matching mnemonics and operands #767 @williballenthin
- add new feature "operand[{0, 1, 2}].number" for matching instruction operand immediate values #767 @williballenthin
- add new feature "operand[{0, 1, 2}].offset" for matching instruction operand offsets #767 @williballenthin
- extract additional offset/number features in certain circumstances #320 @williballenthin
### Breaking Changes

View File

@@ -382,3 +382,8 @@ def get_function_blocks(f):
def is_basic_block_return(bb):
    """Tell whether the basic block's ``type`` equals ``idaapi.fcb_ret`` (i.e. it is a return block)."""
    block_type = bb.type
    return block_type == idaapi.fcb_ret
def has_sib(oper) -> bool:
    """Report whether the operand's ``specflag1`` field equals 1.

    The referenced answer describes this field as the marker for an operand
    that is encoded with a SIB byte.

    via: https://reverseengineering.stackexchange.com/a/14300
    """
    sib_flag = oper.specflag1
    return sib_flag == 1

View File

@@ -12,7 +12,7 @@ import idautils
import capa.features.extractors.helpers
import capa.features.extractors.ida.helpers
from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Characteristic
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
@@ -135,6 +135,15 @@ def extract_insn_number_features(f, bb, insn):
yield Number(const), insn.ea
yield OperandNumber(i, const), insn.ea
if insn.itype == idaapi.NN_add and 0 < const < MAX_STRUCTURE_SIZE and op.type == idaapi.o_imm:
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(const), insn.ea
yield OperandOffset(i, const), insn.ea
def extract_insn_bytes_features(f, bb, insn):
"""parse referenced byte sequences
@@ -209,6 +218,25 @@ def extract_insn_offset_features(f, bb, insn):
yield Offset(op_off), insn.ea
yield OperandOffset(i, op_off), insn.ea
if (
insn.itype == idaapi.NN_lea
and i == 1
# o_displ is used for both:
# [eax+1]
# [eax+ebx+2]
and op.type == idaapi.o_displ
# but the SIB is only present for [eax+ebx+2]
# which we don't want
and not capa.features.extractors.ida.helpers.has_sib(op)
):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx is a zero register).
yield Number(op_off), insn.ea
yield OperandNumber(i, op_off), insn.ea
def contains_stack_cookie_keywords(s):
"""check if string contains stack cookie keywords

View File

@@ -5,7 +5,7 @@ import struct
from smda.common.SmdaReport import SmdaReport
import capa.features.extractors.helpers
from capa.features.insn import API, Number, Offset, Mnemonic
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Characteristic
# security cookie checks may perform non-zeroing XORs, these are expected within a certain
@@ -64,15 +64,25 @@ def extract_insn_number_features(f, bb, insn):
# .text:00401140 call sub_407E2B
# .text:00401145 add esp, 0Ch
return
for operand in operands:
for i, operand in enumerate(operands):
try:
# The result of bitwise operations is calculated as though carried out
# in twos complement with an infinite number of sign bits
value = int(operand, 16) & ((1 << f.smda_report.bitness) - 1)
yield Number(value), insn.offset
except:
except ValueError:
continue
else:
yield Number(value), insn.offset
yield OperandNumber(i, value), insn.offset
if insn.mnemonic == "add" and 0 < value < MAX_STRUCTURE_SIZE:
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(value), insn.offset
yield OperandOffset(i, value), insn.offset
def read_bytes(smda_report, va, num_bytes=None):
@@ -198,11 +208,10 @@ def extract_insn_offset_features(f, bb, insn):
# mov eax, [esi + 4]
# mov eax, [esi + ecx + 16384]
operands = [o.strip() for o in insn.operands.split(",")]
for operand in operands:
if "ptr" not in operand:
continue
for i, operand in enumerate(operands):
if "esp" in operand or "ebp" in operand or "rbp" in operand:
continue
number = 0
number_hex = re.search(PATTERN_HEXNUM, operand)
number_int = re.search(PATTERN_SINGLENUM, operand)
@@ -212,7 +221,26 @@ def extract_insn_offset_features(f, bb, insn):
elif number_int:
number = int(number_int.group("num"))
number = -1 * number if number_int.group().startswith("-") else number
if "ptr" not in operand:
if (
insn.mnemonic == "lea"
and i == 1
and (operand.count("+") + operand.count("-")) == 1
and operand.count("*") == 0
):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx is a zero register).
yield Number(number), insn.offset
yield OperandNumber(i, number), insn.offset
continue
yield Offset(number), insn.offset
yield OperandOffset(i, number), insn.offset
def is_security_cookie(f, bb, insn):

View File

@@ -17,7 +17,7 @@ import envi.archs.amd64.disasm
import capa.features.extractors.helpers
import capa.features.extractors.viv.helpers
from capa.features.insn import API, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.insn import API, MAX_STRUCTURE_SIZE, Number, Offset, Mnemonic, OperandNumber, OperandOffset
from capa.features.common import MAX_BYTES_FEATURE_SIZE, THUNK_CHAIN_DEPTH_DELTA, Bytes, String, Characteristic
from capa.features.extractors.viv.indirect_calls import NotFoundError, resolve_indirect_call
@@ -539,6 +539,15 @@ def extract_op_number_features(f, bb, insn, i, oper):
yield Number(v), insn.va
yield OperandNumber(i, v), insn.va
if insn.mnem == "add" and 0 < v < MAX_STRUCTURE_SIZE and isinstance(oper, envi.archs.i386.disasm.i386ImmOper):
# for pattern like:
#
# add eax, 0x10
#
# assume 0x10 is also an offset (imagine eax is a pointer).
yield Offset(v), insn.va
yield OperandOffset(i, v), insn.va
def extract_op_offset_features(f, bb, insn, i, oper):
"""parse structure offset features from the given operand."""
@@ -567,6 +576,15 @@ def extract_op_offset_features(f, bb, insn, i, oper):
yield Offset(v), insn.va
yield OperandOffset(i, v), insn.va
if insn.mnem == "lea" and i == 1 and not f.vw.probeMemory(v, 1, envi.memory.MM_READ):
# for pattern like:
#
# lea eax, [ebx + 1]
#
# assume 1 is also a number (imagine ebx is a zero register).
yield Number(v), insn.va
yield OperandNumber(i, v), insn.va
# like: [esi + ecx + 16384]
# reg ^ ^
# index ^

View File

@@ -29,6 +29,10 @@ class Number(Feature):
return capa.render.utils.hex(self.value)
# max recognized structure size (and therefore, offset size)
MAX_STRUCTURE_SIZE = 0x10000
class Offset(Feature):
def __init__(self, value: int, description=None):
super(Offset, self).__init__(value, description=description)

View File

@@ -162,6 +162,14 @@ def extract_basic_block_features(extractor, f, bb):
return features
# f may not be hashable (e.g. ida func_t) so cannot @lru_cache this
def extract_instruction_features(extractor, f, bb, insn):
    """Gather the features of a single instruction, grouped by feature.

    Iterates ``extractor.extract_insn_features(f, bb, insn)``, which yields
    ``(feature, address)`` pairs, and returns a ``collections.defaultdict(set)``
    mapping each feature to the set of addresses it was reported at.
    """
    addresses_by_feature = collections.defaultdict(set)
    for pair in extractor.extract_insn_features(f, bb, insn):
        feature, va = pair
        addresses_by_feature[feature].add(va)
    return addresses_by_feature
# note: to reduce the testing time it's recommended to reuse already existing test samples, if possible
def get_data_path_by_name(name):
if name == "mimikatz":
@@ -292,6 +300,13 @@ def get_basic_block(extractor, f, va):
raise ValueError("basic block not found")
def get_instruction(extractor, f, bb, va):
    """Find the instruction in basic block `bb` whose integer value equals `va`.

    Walks ``extractor.get_instructions(f, bb)`` in order and returns the first match.

    Raises:
        ValueError: if no instruction converts (via ``int``) to `va`.
    """
    candidates = extractor.get_instructions(f, bb)
    for candidate in candidates:
        if int(candidate) != va:
            continue
        return candidate
    raise ValueError("instruction not found")
def resolve_scope(scope):
if scope == "file":
@@ -303,8 +318,32 @@ def resolve_scope(scope):
inner_file.__name__ = scope
return inner_file
elif "insn=" in scope:
# like `function=0x401000,bb=0x40100A,insn=0x40100A`
assert "function=" in scope
assert "bb=" in scope
assert "insn=" in scope
fspec, _, spec = scope.partition(",")
bbspec, _, ispec = spec.partition(",")
fva = int(fspec.partition("=")[2], 0x10)
bbva = int(bbspec.partition("=")[2], 0x10)
iva = int(ispec.partition("=")[2], 0x10)
def inner_insn(extractor):
f = get_function(extractor, fva)
bb = get_basic_block(extractor, f, bbva)
insn = get_instruction(extractor, f, bb, iva)
features = extract_instruction_features(extractor, f, bb, insn)
for k, vs in extract_global_features(extractor).items():
features[k].update(vs)
return features
inner_insn.__name__ = scope
return inner_insn
elif "bb=" in scope:
# like `function=0x401000,bb=0x40100A`
assert "function=" in scope
assert "bb=" in scope
fspec, _, bbspec = scope.partition(",")
fva = int(fspec.partition("=")[2], 0x10)
bbva = int(bbspec.partition("=")[2], 0x10)
@@ -434,6 +473,30 @@ FEATURE_PRESENCE_TESTS = sorted(
# insn/offset: negative
("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x1), True),
("mimikatz", "function=0x4011FB", capa.features.insn.Offset(-0x2), True),
#
# insn/offset from mnemonic: add
#
# should not be considered, too big for an offset:
# .text:00401D85 81 C1 00 00 00 80 add ecx, 80000000h
("mimikatz", "function=0x401D64,bb=0x401D73,insn=0x401D85", capa.features.insn.Offset(0x80000000), False),
# should not be considered, relative to stack:
# .text:00401CF6 83 C4 10 add esp, 10h
("mimikatz", "function=0x401CC7,bb=0x401CDE,insn=0x401CF6", capa.features.insn.Offset(0x10), False),
# yes, this is also an offset (imagine eax is a pointer):
# .text:0040223C 83 C0 04 add eax, 4
("mimikatz", "function=0x402203,bb=0x402221,insn=0x40223C", capa.features.insn.Offset(0x4), True),
#
# insn/number from mnemonic: lea
#
# should not be considered, lea operand invalid encoding
# .text:00471EE6 8D 1C 81 lea ebx, [ecx+eax*4]
("mimikatz", "function=0x471EAB,bb=0x471ED8,insn=0x471EE6", capa.features.insn.Number(0x4), False),
# should not be considered, lea operand invalid encoding
# .text:004717B1 8D 4C 31 D0 lea ecx, [ecx+esi-30h]
("mimikatz", "function=0x47153B,bb=0x4717AB,insn=0x4717B1", capa.features.insn.Number(-0x30), False),
# yes, this is also a number (imagine ebx is zero):
# .text:004018C0 8D 4B 02 lea ecx, [ebx+2]
("mimikatz", "function=0x401873,bb=0x4018B2,insn=0x4018C0", capa.features.insn.Number(0x2), True),
# insn/api
("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContextW"), True),
("mimikatz", "function=0x403BAC", capa.features.insn.API("advapi32.CryptAcquireContext"), True),

View File

@@ -22,14 +22,6 @@ def test_smda_features(sample, scope, feature, expected):
if scope.__name__ == "file" and isinstance(feature, capa.features.file.FunctionName) and expected is True:
pytest.xfail("SMDA has no function ID")
if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandNumber) and expected is True:
# SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937
pytest.xfail("SMDA doesn't support operand numbers")
if "bb=" in scope.__name__ and isinstance(feature, capa.features.insn.OperandOffset) and expected is True:
# SMDA not currently maintained, see: https://github.com/mandiant/capa/issues/937
pytest.xfail("SMDA doesn't support operand offsets")
fixtures.do_test_feature_presence(fixtures.get_smda_extractor, sample, scope, feature, expected)