From 8bf0d16fd8008c6a3c28f06279065bc120f11b9e Mon Sep 17 00:00:00 2001
From: Mike Hunhoff <mike.hunhoff@gmail.com>
Date: Thu, 18 Jul 2024 17:52:33 -0600
Subject: [PATCH] vmray: add init support for ELF files

---
 capa/features/extractors/vmray/__init__.py | 15 +++++---
 capa/features/extractors/vmray/global_.py  | 17 ++++++++-
 capa/features/extractors/vmray/models.py   | 40 +++++++++++++++++++++-
 3 files changed, 65 insertions(+), 7 deletions(-)

diff --git a/capa/features/extractors/vmray/__init__.py b/capa/features/extractors/vmray/__init__.py
index 27906d83..141a2595 100644
--- a/capa/features/extractors/vmray/__init__.py
+++ b/capa/features/extractors/vmray/__init__.py
@@ -69,10 +69,12 @@ class VMRayAnalysis:
             logger.warning("VMRay archive does not contain static data (file_type: %s)", self.file_type)
             raise UnsupportedFormatError("VMRay archive does not contain static data (file_type: %s)", self.file_type)
 
-        if not self.sample_file_static_data.pe:
-            logger.warning("VMRay feature extractor only supports PE at this time (file_type: %s)", self.file_type)
+        if not self.sample_file_static_data.pe and not self.sample_file_static_data.elf:
+            logger.warning(
+                "VMRay feature extractor only supports PE and ELF at this time (file_type: %s)", self.file_type
+            )
             raise UnsupportedFormatError(
-                "VMRay feature extractor only supports PE at this time(file_type: %s)", self.file_type
+                "VMRay feature extractor only supports PE and ELF at this time(file_type: %s)", self.file_type
             )
 
         # VMRay does not store static strings for the sample file so we must use the source file
@@ -126,8 +128,11 @@ class VMRayAnalysis:
     def _compute_sections(self):
         assert self.sample_file_static_data is not None
         if self.sample_file_static_data.pe:
-            for section in self.sample_file_static_data.pe.sections:
-                self.sections[section.virtual_address] = section.name
+            for pefile_section in self.sample_file_static_data.pe.sections:
+                self.sections[pefile_section.virtual_address] = pefile_section.name
+        elif self.sample_file_static_data.elf:
+            for elffile_section in self.sample_file_static_data.elf.sections:
+                self.sections[elffile_section.header.sh_addr] = elffile_section.header.sh_name
 
     def _compute_process_ids(self):
         for process in self.sv2.processes.values():
diff --git a/capa/features/extractors/vmray/global_.py b/capa/features/extractors/vmray/global_.py
index 69f91bf0..82ab2458 100644
--- a/capa/features/extractors/vmray/global_.py
+++ b/capa/features/extractors/vmray/global_.py
@@ -9,7 +9,18 @@
 import logging
 from typing import Tuple, Iterator
 
-from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Feature
+from capa.features.common import (
+    OS,
+    OS_LINUX,
+    ARCH_I386,
+    FORMAT_PE,
+    ARCH_AMD64,
+    FORMAT_ELF,
+    OS_WINDOWS,
+    Arch,
+    Format,
+    Feature,
+)
 from capa.features.address import NO_ADDRESS, Address
 from capa.features.extractors.vmray import VMRayAnalysis
 
@@ -32,6 +43,8 @@ def extract_format(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]
     assert analysis.sample_file_static_data is not None
     if analysis.sample_file_static_data.pe:
         yield Format(FORMAT_PE), NO_ADDRESS
+    elif analysis.sample_file_static_data.elf:
+        yield Format(FORMAT_ELF), NO_ADDRESS
     else:
         logger.warning("unrecognized file format: %s", analysis.sv2.analysis_metadata.sample_type)
         raise ValueError(
@@ -44,6 +57,8 @@ def extract_os(analysis: VMRayAnalysis) -> Iterator[Tuple[Feature, Address]]:
 
     if "windows" in sample_type.lower():
         yield OS(OS_WINDOWS), NO_ADDRESS
+    elif "linux" in sample_type.lower():
+        yield OS(OS_LINUX), NO_ADDRESS
     else:
         logger.warning("unrecognized OS: %s", sample_type)
         raise ValueError(f"unrecognized OS from the VMRay report: {sample_type}")
diff --git a/capa/features/extractors/vmray/models.py b/capa/features/extractors/vmray/models.py
index 4291e7d0..9d2bd271 100644
--- a/capa/features/extractors/vmray/models.py
+++ b/capa/features/extractors/vmray/models.py
@@ -72,6 +72,13 @@ def validate_param_list(value):
         return [value]
 
 
+def validate_call_name(value):
+    if value.startswith("sys_"):
+        return value[4:]
+    else:
+        return value
+
+
 # convert the input value to a Python int type before inner validation (int) is called
 HexInt = Annotated[int, BeforeValidator(validate_hex_int)]
 
@@ -98,13 +105,18 @@ class Params(BaseModel):
     params: ParamList = Field(alias="param")
 
 
+# call names may contain unneeded data so we remove that data before
+# the inner validation (str) is called
+CallName = Annotated[str, BeforeValidator(validate_call_name)]
+
+
 # models flog.xml files
 class FunctionCall(BaseModel):
     # ts: HexInt
     fncall_id: HexInt
     process_id: HexInt
     thread_id: HexInt
-    name: str
+    name: CallName
     # addr: HexInt
     # from_addr: HexInt = Field(alias="from")
     params_in: Params = Field(alias="in", default=None)
@@ -193,8 +205,34 @@ class PEFile(BaseModel):
     sections: List[PEFileSection] = []
 
 
+class ElfFileSectionHeader(BaseModel):
+    sh_name: str
+    sh_addr: int
+
+
+class ElfFileSection(BaseModel):
+    header: ElfFileSectionHeader
+
+
+"""
+class ElfFileHeader(BaseModel):
+    file_class: str
+    endianness: str
+    file_type: str
+    architecture: str
+    architecture_human_str: str
+    entry_point: int
+"""
+
+
+class ElfFile(BaseModel):
+    # file_header: ElfFileHeader
+    sections: List[ElfFileSection]
+
+
 class StaticData(BaseModel):
     pe: Optional[PEFile] = None
+    elf: Optional[ElfFile] = None
 
 
 class FileHashes(BaseModel):