vmray: add init support for ELF files

2025-12-12 15:49:46 -08:00 · 2024-07-18 17:52:33 -06:00
parent 24a31a8bc3
commit 8bf0d16fd8
3 changed files with 65 additions and 7 deletions
--- a/capa/features/extractors/vmray/init.py
+++ b/capa/features/extractors/vmray/init.py
@@ -69,10 +69,12 @@ class VMRayAnalysis:
            logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
            raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)

-        if not self.sample_file_static_data.pe:
-            logger.warning("VMRay feature extractor only supports PE at this time (file_type: %s)", self.file_type)
+        if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
+            logger.warning(
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
+            )
            raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE at this time(file_type: %s)", self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time(file_type: %s)", self.file_type
            )

        # VMRay does not store static strings for the sample file so we must use the source file
@@ -126,8 +128,11 @@ class VMRayAnalysis:
    def _compute_sections(self):
        assert self.sample_file_static_data is not None
        if self.sample_file_static_data.pe:
-            for section in self.sample_file_static_data.pe.sections:
-                self.sections[section.virtual_address] = section.name
+            for pefile_section in self.sample_file_static_data.pe.sections:
+                self.sections[pefile_section.virtual_address] = pefile_section.name
+        elif self.sample_file_static_data.elf:
+            for elffile_section in self.sample_file_static_data.elf.sections:
+                self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name

    def _compute_process_ids(self):
        for process in self.sv2.processes.values():
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -9,7 +9,18 @@
 import logging
 from typing import Tuple, Iterator

-from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Feature
+from capa.features.common import (
+    OS,
+    OS_LINUX,
+    ARCH_I386,
+    FORMAT_PE,
+    ARCH_AMD64,
+    FORMAT_ELF,
+    OS_WINDOWS,
+    Arch,
+    Format,
+    Feature,
+)
 from capa.features.address import NO_ADDRESS, Address
 from capa.features.extractors.vmray import VMRayAnalysis

@@ -32,6 +43,8 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
    assert analysis.sample_file_static_data is not None
    if analysis.sample_file_static_data.pe:
        yield Format(FORMAT_PE), NO_ADDRESS
+    elif analysis.sample_file_static_data.elf:
+        yield Format(FORMAT_ELF), NO_ADDRESS
    else:
        logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
        raise ValueError(
@@ -44,6 +57,8 @@ def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:

    if "windows" in sample_type.lower():
        yield OS(OS_WINDOWS), NO_ADDRESS
+    elif "linux" in sample_type.lower():
+        yield OS(OS_LINUX), NO_ADDRESS
    else:
        logger.warning("unrecognized OS: %s", sample_type)
        raise ValueError(f"unrecognized OS from the VMRay report: {sample_type}")
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -72,6 +72,13 @@ def validate_param_list(value):
        return [value]


+def validate_call_name(value):
+    """Drop a leading "sys_" from a call name; non-str input is returned as-is for inner validation."""
+    if isinstance(value, str) and value.startswith("sys_"):
+        return value[4:]
+    return value
+
+
 # convert the input value to a Python int type before inner validation (int) is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]

@@ -98,13 +105,18 @@ class Params(BaseModel):
    params: ParamList = Field(alias="param")


+# call names may contain unneeded data so we remove that data before
+# the inner validation (str) is called
+CallName = Annotated[str, BeforeValidator(validate_call_name)]
+
+
 # models flog.xml files
 class FunctionCall(BaseModel):
    # ts: HexInt
    fncall_id: HexInt
    process_id: HexInt
    thread_id: HexInt
-    name: str
+    name: CallName
    # addr: HexInt
    # from_addr: HexInt = Field(alias="from")
    params_in: Params = Field(alias="in", default=None)
@@ -193,8 +205,34 @@ class PEFile(BaseModel):
    sections: List[PEFileSection] = []


+class ElfFileSectionHeader(BaseModel):
+    sh_name: str  # section name; stored as the value in VMRayAnalysis.sections
+    sh_addr: int  # section address; used as the key in VMRayAnalysis.sections
+
+
+class ElfFileSection(BaseModel):
+    header: ElfFileSectionHeader  # only the header (sh_name, sh_addr) is modeled for each section
+
+
+"""
+class ElfFileHeader(BaseModel):
+    file_class: str
+    endianness: str
+    file_type: str
+    architecture: str
+    architecture_human_str: str
+    entry_point: int
+"""
+
+
+class ElfFile(BaseModel):
+    # file_header: ElfFileHeader
+    sections: List[ElfFileSection] = []  # default to no sections, consistent with PEFile.sections
+
+
 class StaticData(BaseModel):
    pe: Optional[PEFile] = None  # PE static data; None when absent from the input
+    elf: Optional[ElfFile] = None  # ELF static data; None when absent from the input


 class FileHashes(BaseModel):