mirror of
https://github.com/mandiant/capa.git
synced 2025-12-12 23:59:48 -08:00
Compare commits
7 Commits
v9.3.0
...
dex-support
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ec1ddb506c | ||
|
|
e2f655428e | ||
|
|
b5a4d766d9 | ||
|
|
b77103a646 | ||
|
|
036f147df8 | ||
|
|
52d20d2f46 | ||
|
|
e90be5a9bb |
@@ -177,6 +177,34 @@ class DNTokenOffsetAddress(Address):
|
|||||||
return self.token + self.offset
|
return self.token + self.offset
|
||||||
|
|
||||||
|
|
||||||
|
class DexMethodAddress(int, Address):
    """Address of a DEX method, represented as an integer offset.

    The value behaves exactly like the underlying ``int`` for comparison and hashing.
    """

    def __new__(cls, offset: int):
        # int is immutable, so the value must be supplied at construction time
        instance = int.__new__(cls, offset)
        return instance

    def __hash__(self):
        # hash like the plain integer value
        return int.__hash__(self)

    def __repr__(self):
        offset_text = hex(self)
        return f"DexMethodAddress(offset={offset_text})"

    def __str__(self) -> str:
        # the human-readable form is the same as the debug form
        text = repr(self)
        return text
|
||||||
|
|
||||||
|
|
||||||
|
class DexClassAddress(int, Address):
    """Address of a DEX class, represented as an integer offset.

    The value behaves exactly like the underlying ``int`` for comparison and hashing.
    """

    def __new__(cls, offset: int):
        # int is immutable, so the value must be supplied at construction time
        instance = int.__new__(cls, offset)
        return instance

    def __hash__(self):
        # hash like the plain integer value
        return int.__hash__(self)

    def __repr__(self):
        offset_text = hex(self)
        return f"DexClassAddress(offset={offset_text})"

    def __str__(self) -> str:
        # the human-readable form is the same as the debug form
        text = repr(self)
        return text
|
||||||
|
|
||||||
|
|
||||||
class _NoAddress(Address):
|
class _NoAddress(Address):
|
||||||
def __eq__(self, other):
|
def __eq__(self, other):
|
||||||
return True
|
return True
|
||||||
|
|||||||
@@ -409,7 +409,9 @@ ARCH_I386 = "i386"
|
|||||||
ARCH_AMD64 = "amd64"
|
ARCH_AMD64 = "amd64"
|
||||||
# dotnet
|
# dotnet
|
||||||
ARCH_ANY = "any"
|
ARCH_ANY = "any"
|
||||||
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY)
|
# dex
|
||||||
|
ARCH_DALVIK = "dalvik"
|
||||||
|
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_ANY, ARCH_DALVIK)
|
||||||
|
|
||||||
|
|
||||||
class Arch(Feature):
|
class Arch(Feature):
|
||||||
@@ -421,10 +423,11 @@ class Arch(Feature):
|
|||||||
OS_WINDOWS = "windows"
|
OS_WINDOWS = "windows"
|
||||||
OS_LINUX = "linux"
|
OS_LINUX = "linux"
|
||||||
OS_MACOS = "macos"
|
OS_MACOS = "macos"
|
||||||
|
OS_ANDROID = "android"
|
||||||
# dotnet
|
# dotnet
|
||||||
OS_ANY = "any"
|
OS_ANY = "any"
|
||||||
VALID_OS = {os.value for os in capa.features.extractors.elf.OS}
|
VALID_OS = {os.value for os in capa.features.extractors.elf.OS}
|
||||||
VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY})
|
VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS, OS_ANY, OS_ANDROID})
|
||||||
# internal only, not to be used in rules
|
# internal only, not to be used in rules
|
||||||
OS_AUTO = "auto"
|
OS_AUTO = "auto"
|
||||||
|
|
||||||
@@ -452,7 +455,8 @@ class OS(Feature):
|
|||||||
FORMAT_PE = "pe"
|
FORMAT_PE = "pe"
|
||||||
FORMAT_ELF = "elf"
|
FORMAT_ELF = "elf"
|
||||||
FORMAT_DOTNET = "dotnet"
|
FORMAT_DOTNET = "dotnet"
|
||||||
VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
|
FORMAT_DEX = "dex"
|
||||||
|
VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_DEX)
|
||||||
# internal only, not to be used in rules
|
# internal only, not to be used in rules
|
||||||
FORMAT_AUTO = "auto"
|
FORMAT_AUTO = "auto"
|
||||||
FORMAT_SC32 = "sc32"
|
FORMAT_SC32 = "sc32"
|
||||||
@@ -464,6 +468,7 @@ STATIC_FORMATS = {
|
|||||||
FORMAT_PE,
|
FORMAT_PE,
|
||||||
FORMAT_ELF,
|
FORMAT_ELF,
|
||||||
FORMAT_DOTNET,
|
FORMAT_DOTNET,
|
||||||
|
FORMAT_DEX,
|
||||||
}
|
}
|
||||||
DYNAMIC_FORMATS = {
|
DYNAMIC_FORMATS = {
|
||||||
FORMAT_CAPE,
|
FORMAT_CAPE,
|
||||||
|
|||||||
@@ -24,8 +24,11 @@ from capa.features.common import (
|
|||||||
OS_AUTO,
|
OS_AUTO,
|
||||||
ARCH_ANY,
|
ARCH_ANY,
|
||||||
FORMAT_PE,
|
FORMAT_PE,
|
||||||
|
FORMAT_DEX,
|
||||||
FORMAT_ELF,
|
FORMAT_ELF,
|
||||||
|
OS_ANDROID,
|
||||||
OS_WINDOWS,
|
OS_WINDOWS,
|
||||||
|
ARCH_DALVIK,
|
||||||
FORMAT_FREEZE,
|
FORMAT_FREEZE,
|
||||||
FORMAT_RESULT,
|
FORMAT_RESULT,
|
||||||
Arch,
|
Arch,
|
||||||
@@ -41,6 +44,7 @@ logger = logging.getLogger(__name__)
|
|||||||
# match strings for formats
|
# match strings for formats
|
||||||
MATCH_PE = b"MZ"
|
MATCH_PE = b"MZ"
|
||||||
MATCH_ELF = b"\x7fELF"
|
MATCH_ELF = b"\x7fELF"
|
||||||
|
MATCH_DEX = b"dex\n"
|
||||||
MATCH_RESULT = b'{"meta":'
|
MATCH_RESULT = b'{"meta":'
|
||||||
MATCH_JSON_OBJECT = b'{"'
|
MATCH_JSON_OBJECT = b'{"'
|
||||||
|
|
||||||
@@ -61,6 +65,8 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]:
|
|||||||
yield Format(FORMAT_PE), NO_ADDRESS
|
yield Format(FORMAT_PE), NO_ADDRESS
|
||||||
elif buf.startswith(MATCH_ELF):
|
elif buf.startswith(MATCH_ELF):
|
||||||
yield Format(FORMAT_ELF), NO_ADDRESS
|
yield Format(FORMAT_ELF), NO_ADDRESS
|
||||||
|
elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
|
||||||
|
yield Format(FORMAT_DEX), NO_ADDRESS
|
||||||
elif is_freeze(buf):
|
elif is_freeze(buf):
|
||||||
yield Format(FORMAT_FREEZE), NO_ADDRESS
|
yield Format(FORMAT_FREEZE), NO_ADDRESS
|
||||||
elif buf.startswith(MATCH_RESULT):
|
elif buf.startswith(MATCH_RESULT):
|
||||||
@@ -96,6 +102,9 @@ def extract_arch(buf) -> Iterator[Tuple[Feature, Address]]:
|
|||||||
|
|
||||||
yield Arch(arch), NO_ADDRESS
|
yield Arch(arch), NO_ADDRESS
|
||||||
|
|
||||||
|
elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
|
||||||
|
yield Arch(ARCH_DALVIK), NO_ADDRESS
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# we likely end up here:
|
# we likely end up here:
|
||||||
# 1. handling shellcode, or
|
# 1. handling shellcode, or
|
||||||
@@ -129,6 +138,9 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[Tuple[Feature, Address]]:
|
|||||||
|
|
||||||
yield OS(os), NO_ADDRESS
|
yield OS(os), NO_ADDRESS
|
||||||
|
|
||||||
|
elif len(buf) > 8 and buf.startswith(MATCH_DEX) and buf[7] == 0x00:
|
||||||
|
yield OS(OS_ANDROID), NO_ADDRESS
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# we likely end up here:
|
# we likely end up here:
|
||||||
# 1. handling shellcode, or
|
# 1. handling shellcode, or
|
||||||
|
|||||||
421
capa/features/extractors/dexfile.py
Normal file
421
capa/features/extractors/dexfile.py
Normal file
@@ -0,0 +1,421 @@
|
|||||||
|
# Copyright (C) 2023 Mandiant, Inc. All Rights Reserved.
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at: [package root]/LICENSE.txt
|
||||||
|
# Unless required by applicable law or agreed to in writing, software distributed under the License
|
||||||
|
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and limitations under the License.
|
||||||
|
import struct
|
||||||
|
import logging
|
||||||
|
from typing import Set, Dict, List, Tuple, Iterator, Optional, TypedDict
|
||||||
|
from pathlib import Path
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
import dexparser.disassembler as disassembler
|
||||||
|
from dexparser import DEXParser, uleb128_value
|
||||||
|
|
||||||
|
from capa.features.file import Import, FunctionName
|
||||||
|
from capa.features.common import (
|
||||||
|
OS,
|
||||||
|
FORMAT_DEX,
|
||||||
|
OS_ANDROID,
|
||||||
|
ARCH_DALVIK,
|
||||||
|
Arch,
|
||||||
|
Class,
|
||||||
|
Format,
|
||||||
|
String,
|
||||||
|
Feature,
|
||||||
|
Namespace,
|
||||||
|
)
|
||||||
|
from capa.features.address import NO_ADDRESS, Address, DexClassAddress, DexMethodAddress, FileOffsetAddress
|
||||||
|
from capa.features.extractors.base_extractor import (
|
||||||
|
BBHandle,
|
||||||
|
InsnHandle,
|
||||||
|
SampleHashes,
|
||||||
|
FunctionHandle,
|
||||||
|
StaticFeatureExtractor,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
# Reference: https://source.android.com/docs/core/runtime/dex-format
|
||||||
|
|
||||||
|
|
||||||
|
class DexProtoId(TypedDict):
    """Shape of one method prototype entry, as returned by ``DEXParser.get_protoids()``."""

    # index into the string table of the shorty descriptor
    shorty_idx: int
    # index into the type id list of the return type
    return_type_idx: int
    # file offset of the parameter type list; 0 means the prototype has no parameters
    param_off: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexMethodId(TypedDict):
    """Shape of one method identifier entry, as returned by ``DEXParser.get_methods()``."""

    # index into the type id list of the class the method belongs to
    class_idx: int
    # index into the prototype list (see DexProtoId)
    proto_idx: int
    # index into the string table of the method name
    name_idx: int
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DexAnalyzedMethod:
    """A method referenced by the DEX file, with its names and types already resolved.

    ``id_offset``, ``code_offset`` and ``access_flags`` start out with placeholder
    defaults and are filled in after construction.
    """

    class_type: str
    name: str
    shorty_descriptor: str
    return_type: str
    parameters: List[str]
    id_offset: int = 0
    code_offset: int = 0
    access_flags: Optional[int] = None

    @property
    def qualified_name(self) -> str:
        """Class type and method name joined by ``::``."""
        owner, method_name = self.class_type, self.name
        return f"{owner}::{method_name}"

    @property
    def has_definition(self) -> bool:
        """True once access flags are known, i.e. the method is defined in a class."""
        flags = self.access_flags
        return flags is not None

    @property
    def has_code(self) -> bool:
        """True when the method has a code offset.

        A zero code offset means the method is abstract/native or is not defined in a class.
        """
        return bool(self.code_offset)

    @property
    def address(self) -> int:
        """Offset identifying this method.

        Methods without code have no code offset, so they fall back to the method id offset.
        """
        return self.code_offset if self.has_code else self.id_offset
|
||||||
|
|
||||||
|
|
||||||
|
class DexFieldId(TypedDict):
    """Shape of one field identifier entry, as returned by ``DEXParser.get_fieldids()``."""

    # NOTE(review): presumably these mirror field_id_item in the DEX format
    # (defining class, field type, field name) - confirm against dexparser
    class_idx: int
    type_idx: int
    name_idx: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexClassDef(TypedDict):
    """Shape of one class definition entry, as returned by ``DEXParser.get_classdef_data()``."""

    # index into the type id list of the class itself
    class_idx: int
    access_flags: int
    # index into the type id list of the superclass; 0xFFFFFFFF means there is none
    superclass_idx: int
    # file offset of the implemented interfaces list; 0 means there is none
    interfaces_off: int
    # index into the string table of the source file name; 0xFFFFFFFF means unknown
    source_file_idx: int
    annotations_off: int
    # file offset of the class data; 0 means the class has no data
    class_data_off: int
    static_values_off: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexFieldDef(TypedDict):
    """Shape of one field entry inside a class data record (see DexClassData)."""

    # NOTE(review): presumably the delta-encoded field index from the DEX
    # encoded_field structure - confirm against dexparser
    diff: int
    access_flags: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexMethodDef(TypedDict):
    """Shape of one method entry inside a class data record (see DexClassData)."""

    # NOTE(review): presumably the delta-encoded method index from the DEX
    # encoded_method structure - confirm against dexparser
    diff: int
    access_flags: int
    # file offset of the method's code; 0 when the method has no code
    code_off: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexClassData(TypedDict):
    """Shape of the class data record returned by ``DEXParser.get_class_data(offset)``."""

    static_fields: List[DexFieldDef]
    instance_fields: List[DexFieldDef]
    direct_methods: List[DexMethodDef]
    virtual_methods: List[DexMethodDef]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class DexAnalyzedClass:
    """A class defined in the DEX file, with its type references resolved to names."""

    # file offset of this class's entry in the class definitions table
    offset: int
    # pretty (dotted) name of the class
    class_type: str
    # pretty name of the superclass; empty string when the class has none
    superclass_type: str
    # pretty names of the implemented interfaces
    interfaces: List[str]
    # source file name; empty string when it is not recorded
    source_file: str
    # parsed class data; None when the class definition has no class data
    data: Optional[DexClassData]
|
||||||
|
|
||||||
|
|
||||||
|
class DexAnnotation(TypedDict):
    """Shape of one annotation entry.

    NOTE(review): not consumed by the analysis code yet; presumably matches what
    ``DEXParser.get_annotations()`` returns - confirm against dexparser.
    """

    visibility: int
    type_idx_diff: int
    size_diff: int
    name_idx_diff: int
    value_type: int
    encoded_value: int
|
||||||
|
|
||||||
|
|
||||||
|
class DexAnalysis:
    """Resolve the raw tables of a parsed DEX file into strings, classes and methods.

    Construction eagerly decodes the string table, analyzes every class definition
    and method id, and collects the namespaces and externally referenced classes
    that `extract_file_features` later reports.
    """

    # value the DEX format uses for "no index"
    _NO_INDEX = 0xFFFFFFFF
    # sizes, in bytes, of the fixed-size class_def_item and method_id_item records
    _CLASS_DEF_ITEM_SIZE = 32
    _METHOD_ID_ITEM_SIZE = 8

    def get_strings(self):
        """Return ``(file offset, raw MUTF-8 bytes)`` for every entry of the string table."""
        # NOTE: Copied from dexparser, upstream later

        data = self.dex.data
        string_ids_off = self.dex.header_data["string_ids_off"]

        strings: List[Tuple[int, bytes]] = []
        for i in range(self.dex.header_data["string_ids_size"]):
            (offset,) = struct.unpack_from("<L", data, string_ids_off + i * 4)
            utf16_size, size_len = uleb128_value(data, offset)

            # The size prefix counts UTF-16 code units, not encoded bytes; the encoded
            # length is given by the terminating 0 byte (MUTF-8 never contains a 0 byte
            # inside a string). Slicing by the prefix would truncate non-ASCII strings.
            start = offset + size_len
            end = data.find(b"\x00", start)
            if end == -1:
                # malformed/unterminated data: fall back to the declared size
                end = start + utf16_size
            strings.append((offset, data[start:end]))

        return strings

    def __init__(self, dex: DEXParser):
        self.dex = dex

        self.strings = self.get_strings()
        # NOTE: This is technically incorrect, the data is MUTF-8 rather than UTF-8
        # Reference: https://source.android.com/devices/tech/dalvik/dex-format#mutf-8
        self.strings_utf8: List[str] = [
            data.decode("utf-8", errors="backslashreplace") for _, data in self.strings
        ]

        self.type_ids: List[int] = dex.get_typeids()
        self.method_ids: List[DexMethodId] = dex.get_methods()
        self.proto_ids: List[DexProtoId] = dex.get_protoids()
        self.field_ids: List[DexFieldId] = dex.get_fieldids()
        self.class_defs: List[DexClassDef] = dex.get_classdef_data()

        # while this flag is set, every class type that gets decoded is recorded in used_classes
        self._is_analyzing = True
        self.used_classes: Set[str] = set()
        self.classes = self._analyze_classes()
        self.methods = self._analyze_methods()
        self.methods_by_address: Dict[int, DexAnalyzedMethod] = {m.address: m for m in self.methods}

        # namespaces are derived from every class seen so far, defined or merely referenced
        self.namespaces: Set[str] = set()
        for class_type in self.used_classes:
            idx = class_type.rfind(".")
            if idx != -1:
                self.namespaces.add(class_type[:idx])

        # keep only the classes that are referenced but not defined in this file;
        # difference_update tolerates a defined class that was never recorded as used
        self.used_classes.difference_update(self.classes)

        # Only available after code analysis
        self._is_analyzing = False

    def analyze_code(self):
        """Placeholder for the (not yet implemented) code analysis pass."""
        # Loop over the classes and analyze them
        # self.classes: List[DexClass] = self.dex.get_class_data(offset=-1)
        # self.annotations: List[DexAnnotation] = dex.get_annotations(offset=-1)
        # self.static_values: List[int] = dex.get_static_values(offset=-1)
        pass

    def get_string(self, index: int) -> str:
        """Return the decoded string table entry at `index`."""
        return self.strings_utf8[index]

    def _decode_descriptor(self, descriptor: str) -> str:
        """Turn a type descriptor into a readable type name.

        Class descriptors (``Lpkg/Name;``) become dotted names, array descriptors get a
        ``[]`` suffix per dimension, and anything else is looked up in dexparser's
        ``type_descriptor`` table.
        """
        first = descriptor[0]
        if first == "L":
            # strip the leading "L" and the trailing ";"
            pretty = descriptor[1:-1].replace("/", ".")
            if self._is_analyzing:
                self.used_classes.add(pretty)
        elif first == "[":
            pretty = self._decode_descriptor(descriptor[1:]) + "[]"
        else:
            pretty = disassembler.type_descriptor[first]
        return pretty

    def get_pretty_type(self, index: int) -> str:
        """Return the readable name of the type at `index` of the type id list."""
        if index == self._NO_INDEX:
            return "<NO_INDEX>"
        descriptor = self.get_string(self.type_ids[index])
        return self._decode_descriptor(descriptor)

    def _read_type_list(self, offset: int) -> List[str]:
        """Decode the type list at file `offset` into readable type names.

        The list is a little-endian uint32 count followed by that many uint16 type
        indices. An offset of 0 means "no list" and yields an empty result.
        """
        if offset == 0:
            return []
        data = self.dex.data
        (size,) = struct.unpack_from("<L", data, offset)
        type_indices = struct.unpack_from(f"<{size}H", data, offset + 4)
        return [self.get_pretty_type(type_idx) for type_idx in type_indices]

    @staticmethod
    def _iter_method_defs(method_defs):
        """Yield ``(method id index, method_def)`` pairs for one method list of a class.

        In the DEX format each entry stores its index as a difference from the previous
        entry of the same list (the first entry stores the index directly), so the
        running sum is the real index into the method id list.
        """
        # NOTE(review): assumes dexparser exposes the raw method_idx_diff as "diff",
        # as the key name suggests - confirm against dexparser
        index = 0
        for method_def in method_defs:
            index += method_def["diff"]
            yield index, method_def

    def _analyze_classes(self):
        """Build a DexAnalyzedClass for every class definition, keyed by class type."""
        classes: Dict[str, DexAnalyzedClass] = {}
        table_offset = self.dex.header_data["class_defs_off"]

        for index, clazz in enumerate(self.class_defs):
            class_type = self.get_pretty_type(clazz["class_idx"])

            # Superclass
            superclass_idx = clazz["superclass_idx"]
            superclass_type = self.get_pretty_type(superclass_idx) if superclass_idx != self._NO_INDEX else ""

            # Interfaces
            interfaces = self._read_type_list(clazz["interfaces_off"])

            # Source file
            source_file_idx = clazz["source_file_idx"]
            source_file = self.get_string(source_file_idx) if source_file_idx != self._NO_INDEX else ""

            # Data
            data_offset = clazz["class_data_off"]
            data = self.dex.get_class_data(data_offset) if data_offset != 0 else None

            classes[class_type] = DexAnalyzedClass(
                offset=table_offset + index * self._CLASS_DEF_ITEM_SIZE,
                class_type=class_type,
                superclass_type=superclass_type,
                interfaces=interfaces,
                source_file=source_file,
                data=data,
            )
        return classes

    def _analyze_methods(self):
        """Build a DexAnalyzedMethod for every method id, in method id order."""
        methods: List[DexAnalyzedMethod] = []
        for method_id in self.method_ids:
            proto = self.proto_ids[method_id["proto_idx"]]
            parameters = self._read_type_list(proto["param_off"])

            methods.append(
                DexAnalyzedMethod(
                    class_type=self.get_pretty_type(method_id["class_idx"]),
                    name=self.get_string(method_id["name_idx"]),
                    shorty_descriptor=self.get_string(proto["shorty_idx"]),
                    return_type=self.get_pretty_type(proto["return_type_idx"]),
                    parameters=parameters,
                )
            )

        # Fill in the access flags and code offsets of the methods that are defined in a class
        for clazz in self.classes.values():
            if clazz.data is None:
                continue

            # each list restarts its own delta encoding, so resolve them separately
            method_defs = list(self._iter_method_defs(clazz.data["direct_methods"]))
            method_defs += self._iter_method_defs(clazz.data["virtual_methods"])
            for method_index, method_def in method_defs:
                method = methods[method_index]
                method.access_flags = method_def["access_flags"]
                method.code_offset = method_def["code_off"]

        # Record where each method id lives; this doubles as the address of methods without code
        table_offset = self.dex.header_data["method_ids_off"]
        for index, method in enumerate(methods):
            method.id_offset = table_offset + index * self._METHOD_ID_ITEM_SIZE

        return methods

    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
        """Yield the file-scope features: format, strings, methods, namespaces and classes."""
        yield Format(FORMAT_DEX), NO_ADDRESS

        for (offset, _), text in zip(self.strings, self.strings_utf8):
            yield String(text), FileOffsetAddress(offset)

        for method in self.methods:
            # methods defined in this file are function names, everything else is an import
            if method.has_definition:
                yield FunctionName(method.qualified_name), DexMethodAddress(method.address)
            else:
                yield Import(method.qualified_name), DexMethodAddress(method.address)

        for namespace in self.namespaces:
            yield Namespace(namespace), NO_ADDRESS

        for clazz in self.classes.values():
            yield Class(clazz.class_type), DexClassAddress(clazz.offset)

        # classes that are referenced but not defined here have no address in this file
        for class_type in self.used_classes:
            yield Class(class_type), NO_ADDRESS
|
||||||
|
|
||||||
|
|
||||||
|
class DexFeatureExtractor(StaticFeatureExtractor):
    """Static feature extractor for Android DEX files.

    File and global features are implemented. The function, basic block and
    instruction scopes are placeholders that log a TODO and report nothing.
    """

    def __init__(self, path: Path, *, code_analysis: bool):
        """Parse the DEX file at `path`.

        Args:
            path: location of the DEX file.
            code_analysis: when true, also run the code analysis pass and allow the
                function/basic block/instruction scope methods to be used.
        """
        super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
        self.path: Path = path
        self.code_analysis = code_analysis
        self.dex = DEXParser(filedir=str(path))
        self.analysis = DexAnalysis(self.dex)

        # Perform more expensive code analysis only when requested
        if self.code_analysis:
            self.analysis.analyze_code()

    def todo(self):
        """Log, at debug level, the name of the calling method as not yet implemented."""
        import inspect

        # stack()[1] is the direct caller, so this must be called from the unimplemented method itself
        message = "[DexparserFeatureExtractor:TODO] " + inspect.stack()[1].function
        logger.debug(message)

    def _require_code_analysis(self) -> None:
        """Raise if this extractor was created with code analysis disabled."""
        if not self.code_analysis:
            raise Exception("code analysis is disabled")

    def get_base_address(self):
        return NO_ADDRESS

    def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
        # These are hardcoded global features
        yield Format(FORMAT_DEX), NO_ADDRESS
        yield OS(OS_ANDROID), NO_ADDRESS
        yield Arch(ARCH_DALVIK), NO_ADDRESS

    def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]:
        yield from self.analysis.extract_file_features()

    def is_library_function(self, addr: Address) -> bool:
        """Treat every method that is not defined in this DEX file as a library function."""
        assert isinstance(addr, DexMethodAddress)
        method = self.analysis.methods_by_address[addr]
        # exclude androidx/kotlin stuff?
        return not method.has_definition

    def get_function_name(self, addr: Address) -> str:
        """Return the ``Class::method`` name of the method at `addr`."""
        assert isinstance(addr, DexMethodAddress)
        method = self.analysis.methods_by_address[addr]
        return method.qualified_name

    def get_functions(self) -> Iterator[FunctionHandle]:
        """Yield a handle for every method; the handle's inner object is the DexAnalyzedMethod."""
        self._require_code_analysis()

        for method in self.analysis.methods:
            yield FunctionHandle(DexMethodAddress(method.address), method)

    def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]:
        """Not implemented yet: yields no features."""
        self._require_code_analysis()
        method: DexAnalyzedMethod = f.inner
        if method.has_code:
            self.todo()
        # `yield from ()` keeps this a generator without emitting a bogus None,
        # which callers could not unpack into (feature, address)
        yield from ()

    def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]:
        """Not implemented yet: yields no basic blocks."""
        self._require_code_analysis()
        method: DexAnalyzedMethod = f.inner
        if method.has_code:
            self.todo()
        # methods without code have no basic blocks; never yield None as a handle
        yield from ()

    def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]:
        """Not implemented yet: yields no features."""
        self._require_code_analysis()
        self.todo()
        yield from ()

    def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]:
        """Not implemented yet: yields no instructions."""
        self._require_code_analysis()
        self.todo()
        yield from ()

    def extract_insn_features(
        self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle
    ) -> Iterator[Tuple[Feature, Address]]:
        """Not implemented yet: yields no features."""
        self._require_code_analysis()
        self.todo()
        yield from ()
|
||||||
@@ -53,6 +53,8 @@ class AddressType(str, Enum):
|
|||||||
FILE = "file"
|
FILE = "file"
|
||||||
DN_TOKEN = "dn token"
|
DN_TOKEN = "dn token"
|
||||||
DN_TOKEN_OFFSET = "dn token offset"
|
DN_TOKEN_OFFSET = "dn token offset"
|
||||||
|
DEX_METHOD_INDEX = "dex method index"
|
||||||
|
DEX_CLASS_INDEX = "dex class index"
|
||||||
PROCESS = "process"
|
PROCESS = "process"
|
||||||
THREAD = "thread"
|
THREAD = "thread"
|
||||||
CALL = "call"
|
CALL = "call"
|
||||||
@@ -80,6 +82,12 @@ class Address(HashableModel):
|
|||||||
elif isinstance(a, capa.features.address.DNTokenOffsetAddress):
|
elif isinstance(a, capa.features.address.DNTokenOffsetAddress):
|
||||||
return cls(type=AddressType.DN_TOKEN_OFFSET, value=(a.token, a.offset))
|
return cls(type=AddressType.DN_TOKEN_OFFSET, value=(a.token, a.offset))
|
||||||
|
|
||||||
|
elif isinstance(a, capa.features.address.DexMethodAddress):
|
||||||
|
return cls(type=AddressType.DEX_METHOD_INDEX, value=int(a))
|
||||||
|
|
||||||
|
elif isinstance(a, capa.features.address.DexClassAddress):
|
||||||
|
return cls(type=AddressType.DEX_CLASS_INDEX, value=int(a))
|
||||||
|
|
||||||
elif isinstance(a, capa.features.address.ProcessAddress):
|
elif isinstance(a, capa.features.address.ProcessAddress):
|
||||||
return cls(type=AddressType.PROCESS, value=(a.ppid, a.pid))
|
return cls(type=AddressType.PROCESS, value=(a.ppid, a.pid))
|
||||||
|
|
||||||
@@ -125,6 +133,14 @@ class Address(HashableModel):
|
|||||||
assert isinstance(offset, int)
|
assert isinstance(offset, int)
|
||||||
return capa.features.address.DNTokenOffsetAddress(token, offset)
|
return capa.features.address.DNTokenOffsetAddress(token, offset)
|
||||||
|
|
||||||
|
elif self.type is AddressType.DEX_METHOD_INDEX:
|
||||||
|
assert isinstance(self.value, int)
|
||||||
|
return capa.features.address.DexMethodAddress(self.value)
|
||||||
|
|
||||||
|
elif self.type is AddressType.DEX_CLASS_INDEX:
|
||||||
|
assert isinstance(self.value, int)
|
||||||
|
return capa.features.address.DexClassAddress(self.value)
|
||||||
|
|
||||||
elif self.type is AddressType.PROCESS:
|
elif self.type is AddressType.PROCESS:
|
||||||
assert isinstance(self.value, tuple)
|
assert isinstance(self.value, tuple)
|
||||||
ppid, pid = self.value
|
ppid, pid = self.value
|
||||||
|
|||||||
11
capa/main.py
11
capa/main.py
@@ -45,6 +45,7 @@ import capa.render.result_document
|
|||||||
import capa.render.result_document as rdoc
|
import capa.render.result_document as rdoc
|
||||||
import capa.features.extractors.common
|
import capa.features.extractors.common
|
||||||
import capa.features.extractors.pefile
|
import capa.features.extractors.pefile
|
||||||
|
import capa.features.extractors.dexfile
|
||||||
import capa.features.extractors.elffile
|
import capa.features.extractors.elffile
|
||||||
import capa.features.extractors.dotnetfile
|
import capa.features.extractors.dotnetfile
|
||||||
import capa.features.extractors.base_extractor
|
import capa.features.extractors.base_extractor
|
||||||
@@ -72,6 +73,7 @@ from capa.features.common import (
|
|||||||
OS_LINUX,
|
OS_LINUX,
|
||||||
OS_MACOS,
|
OS_MACOS,
|
||||||
FORMAT_PE,
|
FORMAT_PE,
|
||||||
|
FORMAT_DEX,
|
||||||
FORMAT_ELF,
|
FORMAT_ELF,
|
||||||
OS_WINDOWS,
|
OS_WINDOWS,
|
||||||
FORMAT_AUTO,
|
FORMAT_AUTO,
|
||||||
@@ -307,6 +309,11 @@ def get_extractor(
|
|||||||
|
|
||||||
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
|
return capa.features.extractors.dnfile.extractor.DnfileFeatureExtractor(path)
|
||||||
|
|
||||||
|
elif format_ == FORMAT_DEX:
|
||||||
|
import capa.features.extractors.dexfile
|
||||||
|
|
||||||
|
return capa.features.extractors.dexfile.DexFeatureExtractor(path, code_analysis=True)
|
||||||
|
|
||||||
elif backend == BACKEND_BINJA:
|
elif backend == BACKEND_BINJA:
|
||||||
from capa.features.extractors.binja.find_binja_api import find_binja_path
|
from capa.features.extractors.binja.find_binja_api import find_binja_path
|
||||||
|
|
||||||
@@ -375,6 +382,9 @@ def get_file_extractors(sample: Path, format_: str) -> List[FeatureExtractor]:
|
|||||||
elif format_ == capa.features.common.FORMAT_ELF:
|
elif format_ == capa.features.common.FORMAT_ELF:
|
||||||
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
|
file_extractors.append(capa.features.extractors.elffile.ElfFeatureExtractor(sample))
|
||||||
|
|
||||||
|
elif format_ == capa.features.common.FORMAT_DEX:
|
||||||
|
file_extractors.append(capa.features.extractors.dexfile.DexFeatureExtractor(sample, code_analysis=False))
|
||||||
|
|
||||||
elif format_ == FORMAT_CAPE:
|
elif format_ == FORMAT_CAPE:
|
||||||
report = json.load(Path(sample).open(encoding="utf-8"))
|
report = json.load(Path(sample).open(encoding="utf-8"))
|
||||||
file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
|
file_extractors.append(capa.features.extractors.cape.extractor.CapeExtractor.from_report(report))
|
||||||
@@ -797,6 +807,7 @@ def install_common_args(parser, wanted=None):
|
|||||||
(FORMAT_PE, "Windows PE file"),
|
(FORMAT_PE, "Windows PE file"),
|
||||||
(FORMAT_DOTNET, ".NET PE file"),
|
(FORMAT_DOTNET, ".NET PE file"),
|
||||||
(FORMAT_ELF, "Executable and Linkable Format"),
|
(FORMAT_ELF, "Executable and Linkable Format"),
|
||||||
|
(FORMAT_DEX, "Android DEX file"),
|
||||||
(FORMAT_SC32, "32-bit shellcode"),
|
(FORMAT_SC32, "32-bit shellcode"),
|
||||||
(FORMAT_SC64, "64-bit shellcode"),
|
(FORMAT_SC64, "64-bit shellcode"),
|
||||||
(FORMAT_CAPE, "CAPE sandbox report"),
|
(FORMAT_CAPE, "CAPE sandbox report"),
|
||||||
|
|||||||
@@ -54,6 +54,12 @@ def format_address(address: frz.Address) -> str:
|
|||||||
assert isinstance(token, int)
|
assert isinstance(token, int)
|
||||||
assert isinstance(offset, int)
|
assert isinstance(offset, int)
|
||||||
return f"token({capa.helpers.hex(token)})+{capa.helpers.hex(offset)}"
|
return f"token({capa.helpers.hex(token)})+{capa.helpers.hex(offset)}"
|
||||||
|
elif address.type == frz.AddressType.DEX_METHOD_INDEX:
|
||||||
|
assert isinstance(address.value, int)
|
||||||
|
return f"method({capa.helpers.hex(address.value)})"
|
||||||
|
elif address.type == frz.AddressType.DEX_CLASS_INDEX:
|
||||||
|
assert isinstance(address.value, int)
|
||||||
|
return f"class({capa.helpers.hex(address.value)})"
|
||||||
elif address.type == frz.AddressType.PROCESS:
|
elif address.type == frz.AddressType.PROCESS:
|
||||||
assert isinstance(address.value, tuple)
|
assert isinstance(address.value, tuple)
|
||||||
ppid, pid = address.value
|
ppid, pid = address.value
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ dependencies = [
|
|||||||
"dncil==1.0.2",
|
"dncil==1.0.2",
|
||||||
"pydantic==2.4.0",
|
"pydantic==2.4.0",
|
||||||
"protobuf==4.23.4",
|
"protobuf==4.23.4",
|
||||||
|
"dexparser==1.2.0",
|
||||||
]
|
]
|
||||||
dynamic = ["version"]
|
dynamic = ["version"]
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user