Merge pull request #770 from fireeye/elffile-extractor

add light weight ElfFeatureExtractor
This commit is contained in:
Willi Ballenthin
2021-09-13 13:27:00 -06:00
committed by GitHub
12 changed files with 201 additions and 25 deletions

View File

@@ -71,3 +71,6 @@ ignore_missing_imports = True
[mypy-devtools.*]
ignore_missing_imports = True
[mypy-elftools.*]
ignore_missing_imports = True

View File

@@ -10,6 +10,7 @@
- rule format: add feature `os: ` for operating system, like `os: windows` #723 @williballenthin
- rule format: add feature `substring: ` for verbatim strings with leading/trailing wildcards #737 @williballenthin
- scripts: add `profile-memory.py` for profiling memory usage #736 @williballenthin
- main: add lightweight ELF file feature extractor to detect file limitations #770 @mr-tz
### Breaking Changes

View File

@@ -344,7 +344,6 @@ VALID_ARCH = (ARCH_I386, ARCH_AMD64)
class Arch(Feature):
def __init__(self, value: str, description=None):
assert value in VALID_ARCH
super(Arch, self).__init__(value, description=description)
self.name = "arch"
@@ -358,7 +357,6 @@ VALID_OS.update({OS_WINDOWS, OS_LINUX, OS_MACOS})
class OS(Feature):
def __init__(self, value: str, description=None):
assert value in (VALID_OS)
super(OS, self).__init__(value, description=description)
self.name = "os"
@@ -370,7 +368,6 @@ VALID_FORMAT = (FORMAT_PE, FORMAT_ELF)
class Format(Feature):
def __init__(self, value: str, description=None):
assert value in (VALID_FORMAT)
super(Format, self).__init__(value, description=description)
self.name = "format"

View File

@@ -9,7 +9,7 @@
import abc
from typing import Tuple, Iterator, SupportsInt
from capa.features.basicblock import Feature
from capa.features.common import Feature
# feature extractors may reference functions, BBs, insns by opaque handle values.
# the only requirement of these handles are that they support `__int__`,

View File

@@ -5,13 +5,25 @@ import contextlib
import pefile
import capa.features
import capa.features.extractors.elf
import capa.features.extractors.pefile
from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format
from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, Arch, Format, String
logger = logging.getLogger(__name__)
def extract_file_strings(buf, **kwargs):
    """
    extract ASCII and UTF-16 LE strings from file

    yields (String feature, offset) pairs; extra keyword arguments are accepted and ignored.
    """
    strings = capa.features.extractors.strings

    # ASCII strings are emitted first, followed by the unicode ones.
    for extract in (strings.extract_ascii_strings, strings.extract_unicode_strings):
        for found in extract(buf):
            yield String(found.s), found.offset
def extract_format(buf):
if buf.startswith(b"MZ"):
yield Format(FORMAT_PE), 0x0
@@ -34,7 +46,7 @@ def extract_arch(buf):
with contextlib.closing(io.BytesIO(buf)) as f:
arch = capa.features.extractors.elf.detect_elf_arch(f)
if arch == "unknown":
if arch not in capa.features.common.VALID_ARCH:
logger.debug("unsupported arch: %s", arch)
return
@@ -62,7 +74,12 @@ def extract_os(buf):
with contextlib.closing(io.BytesIO(buf)) as f:
os = capa.features.extractors.elf.detect_elf_os(f)
if os not in capa.features.common.VALID_OS:
logger.debug("unsupported os: %s", os)
return
yield OS(os), 0x0
else:
# we likely end up here:
# 1. handling shellcode, or

View File

@@ -1,3 +1,10 @@
# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import struct
import logging
from enum import Enum

View File

@@ -0,0 +1,142 @@
# Copyright (C) 2020 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import io
import logging
import contextlib
from typing import Tuple
from elftools.elf.elffile import ELFFile, SymbolTableSection
import capa.features.extractors.common
from capa.features.file import Import, Section
from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
from capa.features.extractors.elf import Arch as ElfArch
from capa.features.extractors.base_extractor import FeatureExtractor
logger = logging.getLogger(__name__)
def extract_file_import_names(elf, **kwargs):
    """
    yield an Import feature for each named function symbol in the ELF symbol tables.

    args:
      elf: parsed ELF file whose sections are walked via `iter_sections()`.
      **kwargs: accepted and ignored, so all file handlers can be invoked uniformly.

    yields:
      (Import, 0x0) pairs; the symbol address is not resolved yet, so it is always 0x0.
    """
    # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
    # collect the symbol tables up front, before walking the symbols of any of them.
    symbol_tables = [s for s in elf.iter_sections() if isinstance(s, SymbolTableSection)]

    for section in symbol_tables:
        if section["sh_entsize"] == 0:
            logger.debug("Symbol table '%s' has a sh_entsize of zero!", section.name)
            continue

        logger.debug("Symbol table '%s' contains %s entries:", section.name, section.num_symbols())

        for symbol in section.iter_symbols():
            # NOTE(review): every named STT_FUNC symbol is reported, whether or not it is
            # defined in this file - confirm that this is the intended meaning of "import".
            if symbol.name and symbol.entry.st_info.type == "STT_FUNC":
                # TODO symbol address
                # TODO symbol version info?
                yield Import(symbol.name), 0x0
def extract_file_section_names(elf, **kwargs):
    """
    yield a Section feature, paired with the section's `sh_addr`, for each section of the ELF file.

    a section without a name is reported as "NULL" when `is_null()` says so,
    and is skipped otherwise. extra keyword arguments are accepted and ignored.
    """
    for sec in elf.iter_sections():
        label = sec.name or ("NULL" if sec.is_null() else None)
        if label is None:
            # unnamed and not a null section: nothing to report
            continue
        yield Section(label), sec.header.sh_addr
def extract_file_strings(buf, **kwargs):
    """
    yield the string features found in the raw file bytes.

    delegates to the shared helper in capa.features.extractors.common;
    extra keyword arguments are accepted and ignored.
    """
    shared_extractor = capa.features.extractors.common.extract_file_strings
    yield from shared_extractor(buf)
def extract_file_os(elf, buf, **kwargs):
    """
    yield exactly one OS feature for the file.

    the first result of the shared OS detection is passed through unchanged;
    when that detection produces nothing, an OS("unknown") feature at 0x0 is yielded instead.
    """
    # our current approach does not always get an OS value, e.g. for packed samples
    # for file limitation purposes, we're more lax here
    detected = next(capa.features.extractors.common.extract_os(buf), None)
    if detected is None:
        detected = OS("unknown"), 0x0
    yield detected
def extract_file_format(**kwargs):
    """
    yield the single Format feature for ELF files, at offset 0x0.

    all keyword arguments are accepted and ignored.
    """
    elf_format = Format(FORMAT_ELF)
    yield elf_format, 0x0
def extract_file_arch(elf, **kwargs):
    """
    yield the Arch feature matching the machine architecture reported by the ELF file.

    only "x86" and "x64" are mapped; any other value is logged as a warning
    and no feature is yielded.
    """
    # TODO merge with capa.features.extractors.elf.detect_elf_arch()
    machine = elf.get_machine_arch()

    if machine not in ("x86", "x64"):
        logger.warning("unsupported architecture: %s", machine)
        return

    yield Arch(ElfArch.I386 if machine == "x86" else ElfArch.AMD64), 0x0
def extract_file_features(elf: ELFFile, buf: bytes) -> Tuple[Feature, int]:
    """
    run every handler in FILE_HANDLERS and yield the (feature, address) pairs they produce.

    each handler is called with both `elf` and `buf` as keyword arguments,
    in the order the handlers are listed.
    """
    yield from (
        (feature, va)
        for handler in FILE_HANDLERS
        for feature, va in handler(elf=elf, buf=buf)
    )
# file-scope feature handlers, run in this order by extract_file_features().
# each one is invoked with the keyword arguments `elf` and `buf`.
FILE_HANDLERS = (
    # TODO extract_file_export_names,
    extract_file_import_names,
    extract_file_section_names,
    extract_file_strings,
    # no library matching
    extract_file_os,
    extract_file_format,
    extract_file_arch,
)
class ElfFeatureExtractor(FeatureExtractor):
    """
    feature extractor that only produces file-scope features for an ELF file.

    every function, basic block, and instruction level method raises NotImplementedError.
    """

    def __init__(self, path: str):
        """
        read the file at `path` and parse it as an ELF file.

        the raw bytes are read once and kept, so the parsed ELF and the bytes used
        for file feature extraction always come from the same snapshot of the file.
        """
        super(ElfFeatureExtractor, self).__init__()
        self.path = path
        with open(self.path, "rb") as f:
            self._buf = f.read()
        self.elf = ELFFile(io.BytesIO(self._buf))

    def get_base_address(self):
        """
        return the virtual address of the first PT_LOAD segment, or None if there is none.
        """
        # virtual address of the first segment with type LOAD
        for segment in self.elf.iter_segments():
            if segment.header.p_type == "PT_LOAD":
                return segment.header.p_vaddr
        return None

    def extract_file_features(self):
        """
        yield (feature, address) pairs from the module-level file feature handlers.
        """
        # reuse the bytes read in __init__ rather than reading the file a second time.
        for feature, va in extract_file_features(self.elf, self._buf):
            yield feature, va

    def get_functions(self):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def extract_function_features(self, f):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def get_basic_blocks(self, f):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def extract_basic_block_features(self, f, bb):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def get_instructions(self, f, bb):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def extract_insn_features(self, f, bb, insn):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def is_library_function(self, va):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

    def get_function_name(self, va):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

View File

@@ -5,16 +5,18 @@
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import logging
import pefile
import capa.features.common
import capa.features.extractors
import capa.features.extractors.common
import capa.features.extractors.helpers
import capa.features.extractors.strings
from capa.features.file import Export, Import, Section
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, String, Characteristic
from capa.features.common import OS, ARCH_I386, FORMAT_PE, ARCH_AMD64, OS_WINDOWS, Arch, Format, Characteristic
from capa.features.extractors.base_extractor import FeatureExtractor
logger = logging.getLogger(__name__)
@@ -85,14 +87,7 @@ def extract_file_section_names(pe, **kwargs):
def extract_file_strings(buf, **kwargs):
"""
extract ASCII and UTF-16 LE strings from file
"""
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
yield String(s.s), s.offset
for s in capa.features.extractors.strings.extract_unicode_strings(buf):
yield String(s.s), s.offset
yield from capa.features.extractors.common.extract_file_strings(buf)
def extract_file_function_names(**kwargs):

View File

@@ -68,14 +68,7 @@ def extract_file_section_names(vw, **kwargs):
def extract_file_strings(buf, **kwargs):
"""
extract ASCII and UTF-16 LE strings from file
"""
for s in capa.features.extractors.strings.extract_ascii_strings(buf):
yield String(s.s), s.offset
for s in capa.features.extractors.strings.extract_unicode_strings(buf):
yield String(s.s), s.offset
yield from capa.features.extractors.common.extract_file_strings(buf)
def extract_file_function_names(vw, **kwargs):

View File

@@ -39,6 +39,7 @@ import capa.render.vverbose
import capa.features.extractors
import capa.features.extractors.common
import capa.features.extractors.pefile
import capa.features.extractors.elffile
from capa.rules import Rule, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.helpers import get_file_taste
@@ -945,9 +946,10 @@ def main(argv=None):
logger.error("%s", str(e))
return -1
file_extractor = None
if args.format == "pe" or (args.format == "auto" and taste.startswith(b"MZ")):
# this pefile file feature extractor is pretty light weight: it doesn't do any code analysis.
# so we can fairly quickly determine if the given PE file has "pure" file-scope rules
# these pefile and elffile file feature extractors are pretty light weight: they don't do any code analysis.
# so we can fairly quickly determine if the given file has "pure" file-scope rules
# that indicate a limitation (like "file is packed based on section names")
# and avoid doing a full code analysis on difficult/impossible binaries.
try:
@@ -957,6 +959,17 @@ def main(argv=None):
except PEFormatError as e:
logger.error("Input file '%s' is not a valid PE file: %s", args.sample, str(e))
return -1
elif args.format == "elf" or (args.format == "auto" and taste.startswith(b"\x7fELF")):
try:
from elftools.common.exceptions import ELFError
file_extractor = capa.features.extractors.elffile.ElfFeatureExtractor(args.sample)
except ELFError as e:
logger.error("Input file '%s' is not a valid ELF file: %s", args.sample, str(e))
return -1
if file_extractor:
pure_file_capabilities, _ = find_file_capabilities(rules, file_extractor, {})
# file limitations that rely on non-file scope won't be detected here.

View File

@@ -260,6 +260,7 @@ def parse_feature(key: str):
elif key == "format":
return capa.features.common.Format
elif key == "arch":
return capa.features.common.Arch
else:
raise InvalidRule("unexpected statement: %s" % key)
@@ -471,6 +472,12 @@ def build_statements(d, scope: str):
raise InvalidRule("unexpected range: %s" % (count))
elif key == "string" and not isinstance(d[key], str):
raise InvalidRule("ambiguous string value %s, must be defined as explicit string" % d[key])
elif (
(key == "os" and d[key] not in capa.features.common.VALID_OS)
or (key == "format" and d[key] not in capa.features.common.VALID_FORMAT)
or (key == "arch" and d[key] not in capa.features.common.VALID_ARCH)
):
raise InvalidRule("unexpected %s value %s" % (key, d[key]))
else:
Feature = parse_feature(key)
value, description = parse_description(d[key], key, d.get("description"))

View File

@@ -26,6 +26,7 @@ requirements = [
"smda==1.6.2",
"pefile==2021.9.3",
"typing==3.7.4.3",
"pyelftools==0.27",
]
# this sets __version__