diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py index 3d2bf9d0..8fbfa915 100644 --- a/capa/features/extractors/vmray/__init__.py +++ b/capa/features/extractors/vmray/__init__.py @@ -5,10 +5,11 @@ # Unless required by applicable law or agreed to in writing, software distributed under the License # is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and limitations under the License. -from typing import Dict +from typing import Dict, List +from collections import defaultdict from capa.exceptions import UnsupportedFormatError -from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData +from capa.features.extractors.vmray.models import File, Flog, SummaryV2, StaticData, FunctionCall class VMRayAnalysis: @@ -18,6 +19,8 @@ class VMRayAnalysis: self.exports: Dict[int, str] = {} self.imports: Dict[int, str] = {} self.sections: Dict[int, str] = {} + self.process_threads: Dict[int, List[int]] = defaultdict(list) + self.process_calls: Dict[int, Dict[int, List[FunctionCall]]] = defaultdict(lambda: defaultdict(list)) self.base_address: int self.sample_file_name: str @@ -28,6 +31,8 @@ class VMRayAnalysis: self._compute_base_address() self._compute_exports() self._compute_sections() + self._compute_process_threads() + self._compute_process_calls() if not self.sample_file_static_data.pe: raise UnsupportedFormatError("VMRay feature extractor only supports PE at this time") @@ -61,3 +66,18 @@ class VMRayAnalysis: if self.sample_file_static_data.pe: for section in self.sample_file_static_data.pe.sections: self.sections[section.virtual_address] = section.name + + def _compute_process_threads(self): + for function_call in self.flog.analysis.function_calls: + pid: int = int(function_call.process_id) + tid: int = int(function_call.thread_id) + + if tid not in self.process_threads[pid]: + self.process_threads[pid].append(tid) + + def _compute_process_calls(self): + for function_call in self.flog.analysis.function_calls: + pid: int = int(function_call.process_id) + tid: int = int(function_call.thread_id) + + self.process_calls[pid][tid].append(function_call) diff --git a/capa/features/extractors/vmray/call.py b/capa/features/extractors/vmray/call.py index c4d117d2..c5f8d446 100644 --- a/capa/features/extractors/vmray/call.py +++ b/capa/features/extractors/vmray/call.py @@ -21,6 +21,10 @@ def extract_call_features(ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) - yields: Feature, address; where Feature is either: API, Number, or String. """ + # TODO update for new models + # print(ch) + return + # Extract API name yield API(ch.inner.name), ch.inner.address diff --git a/capa/features/extractors/vmray/extractor.py b/capa/features/extractors/vmray/extractor.py index 09450200..16607ac4 100644 --- a/capa/features/extractors/vmray/extractor.py +++ b/capa/features/extractors/vmray/extractor.py @@ -14,10 +14,11 @@ from zipfile import ZipFile import xmltodict import capa.helpers +import capa.features.extractors.vmray.call import capa.features.extractors.vmray.file import capa.features.extractors.vmray.global_ -from capa.features.common import Feature -from capa.features.address import Address, AbsoluteVirtualAddress +from capa.features.common import Feature, Characteristic +from capa.features.address import NO_ADDRESS, Address, ThreadAddress, DynamicCallAddress, AbsoluteVirtualAddress from capa.features.extractors.vmray import VMRayAnalysis from capa.features.extractors.vmray.models import Flog, Process, SummaryV2 from capa.features.extractors.base_extractor import ( @@ -28,8 +29,6 @@ from capa.features.extractors.base_extractor import ( DynamicFeatureExtractor, ) -# TODO also/or look into xmltodict? - class VMRayExtractor(DynamicFeatureExtractor): def __init__(self, analysis: VMRayAnalysis): @@ -68,23 +67,27 @@ class VMRayExtractor(DynamicFeatureExtractor): return process.image_name def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]: - # TODO (meh) - yield from [] + for thread in self.analysis.process_threads[ph.address.pid]: + address: ThreadAddress = ThreadAddress(process=ph.address, tid=thread) + yield ThreadHandle(address=address, inner={}) def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[Tuple[Feature, Address]]: - # force this routine to be a generator, - # but we don't actually have any elements to generate. - yield from [] + if False: + # force this routine to be a generator, + # but we don't actually have any elements to generate. + yield Characteristic("never"), NO_ADDRESS + return def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]: - # TODO (meh) - yield from [] + for function_call in self.analysis.process_calls[ph.address.pid][th.address.tid]: + addr = DynamicCallAddress(thread=th.address, id=int(function_call.fncall_id)) + yield CallHandle(address=addr, inner=function_call) def extract_call_features( self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle ) -> Iterator[Tuple[Feature, Address]]: # TODO (meh) - yield from [] + yield from capa.features.extractors.vmray.call.extract_features(ph, th, ch) def get_call_name(self, ph, th, ch) -> str: # TODO (meh) diff --git a/capa/features/extractors/vmray/file.py b/capa/features/extractors/vmray/file.py index 2c6463c1..8344f394 100644 --- a/capa/features/extractors/vmray/file.py +++ b/capa/features/extractors/vmray/file.py @@ -22,10 +22,10 @@ def get_processes(analysis: VMRayAnalysis) -> Iterator[ProcessHandle]: processes: Dict[str, Process] = analysis.sv2.processes for _, process in processes.items(): - pid = process.os_pid - ppid = processes[process.ref_parent_process.path[1]].os_pid if process.ref_parent_process else 0 + pid = process.monitor_id + ppid = processes[process.ref_parent_process.path[1]].monitor_id if process.ref_parent_process else 0 - addr = ProcessAddress(pid=pid, ppid=ppid) + addr = ProcessAddress(pid=int(pid), ppid=int(ppid)) yield ProcessHandle(address=addr, inner=process) diff --git a/scripts/show-features.py b/scripts/show-features.py index 30ad2a4b..b46432db 100644 --- a/scripts/show-features.py +++ b/scripts/show-features.py @@ -235,34 +235,34 @@ def print_dynamic_features(processes, extractor: DynamicFeatureExtractor): print(f" proc: {extractor.get_process_name(p)}: {feature}") - for t in extractor.get_threads(p): - print(f" thread: {t.address.tid}") - for feature, addr in extractor.extract_thread_features(p, t): + for t in extractor.get_threads(p): + print(f" thread: {t.address.tid}") + for feature, addr in extractor.extract_thread_features(p, t): + if is_global_feature(feature): + continue + + if feature != Feature(0): + print(f" {format_address(addr)}: {feature}") + + for call in extractor.get_calls(p, t): + apis = [] + arguments = [] + for feature, addr in extractor.extract_call_features(p, t, call): if is_global_feature(feature): continue - if feature != Feature(0): - print(f" {format_address(addr)}: {feature}") + if isinstance(feature, API): + assert isinstance(addr, capa.features.address.DynamicCallAddress) + apis.append((addr.id, str(feature.value))) - for call in extractor.get_calls(p, t): - apis = [] - arguments = [] - for feature, addr in extractor.extract_call_features(p, t, call): - if is_global_feature(feature): - continue + if isinstance(feature, (Number, String)): + arguments.append(str(feature.value)) - if isinstance(feature, API): - assert isinstance(addr, capa.features.address.DynamicCallAddress) - apis.append((addr.id, str(feature.value))) + # if not apis: + # print(f" arguments=[{', '.join(arguments)}]") - if isinstance(feature, (Number, String)): - arguments.append(str(feature.value)) - - if not apis: - print(f" arguments=[{', '.join(arguments)}]") - - for cid, api in apis: - print(f" call {cid}: {api}({', '.join(arguments)})") + for cid, api in apis: + print(f" call {cid}: {api}({', '.join(arguments)})") def ida_main():