Merge pull request #1762 from yelhamer/modify-sample-hashes

Modify sample hashes
This commit is contained in:
Yacine
2023-08-25 10:29:38 +03:00
committed by GitHub
10 changed files with 24 additions and 58 deletions

View File

@@ -106,13 +106,14 @@ class StaticFeatureExtractor:
__metaclass__ = abc.ABCMeta
def __init__(self):
def __init__(self, hashes: SampleHashes):
#
# note: a subclass should define ctor parameters for its own use.
# for example, the Vivisect feature extractor might require the vw and/or path.
# this base class doesn't know what to do with that info, though.
#
super().__init__()
self._sample_hashes = hashes
@abc.abstractmethod
def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]:
@@ -130,7 +131,7 @@ class StaticFeatureExtractor:
"""
fetch the hashes for the sample contained within the extractor.
"""
raise NotImplementedError()
return self._sample_hashes
@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
@@ -353,20 +354,21 @@ class DynamicFeatureExtractor:
__metaclass__ = abc.ABCMeta
def __init__(self):
def __init__(self, hashes: SampleHashes):
#
# note: a subclass should define ctor parameters for its own use.
# for example, the Vivisect feature extractor might require the vw and/or path.
# this base class doesn't know what to do with that info, though.
#
super().__init__()
self._sample_hashes = hashes
@abc.abstractmethod
def get_sample_hashes(self) -> SampleHashes:
"""
fetch the hashes for the sample contained within the extractor.
"""
raise NotImplementedError()
return self._sample_hashes
@abc.abstractmethod
def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:

View File

@@ -29,20 +29,16 @@ from capa.features.extractors.base_extractor import (
class BinjaFeatureExtractor(StaticFeatureExtractor):
def __init__(self, bv: binja.BinaryView):
super().__init__()
super().__init__(hashes=SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes()))
self.bv = bv
self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.binja.file.extract_file_format(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_os(self.bv))
self.global_features.extend(capa.features.extractors.binja.global_.extract_arch(self.bv))
self.sample_hashes = SampleHashes.from_bytes(Path(bv.file.original_filename).read_bytes())
def get_base_address(self):
return AbsoluteVirtualAddress(self.bv.start)
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
yield from self.global_features

View File

@@ -33,15 +33,14 @@ TESTED_VERSIONS = {"2.2-CAPE", "2.4-CAPE"}
class CapeExtractor(DynamicFeatureExtractor):
def __init__(self, report: CapeReport):
super().__init__()
self.report: CapeReport = report
self.sample_hashes = SampleHashes(
md5=self.report.target.file.md5.lower(),
sha1=self.report.target.file.sha1.lower(),
sha256=self.report.target.file.sha256.lower(),
super().__init__(
hashes=SampleHashes(
md5=report.target.file.md5.lower(),
sha1=report.target.file.sha1.lower(),
sha256=report.target.file.sha256.lower(),
)
)
self.report: CapeReport = report
self.global_features = capa.features.extractors.cape.global_.extract_features(self.report)
def get_base_address(self) -> Union[AbsoluteVirtualAddress, _NoAddress, None]:
@@ -49,9 +48,6 @@ class CapeExtractor(DynamicFeatureExtractor):
assert self.report.static is not None and self.report.static.pe is not None
return AbsoluteVirtualAddress(self.report.static.pe.imagebase)
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]:
yield from self.global_features

View File

@@ -76,9 +76,8 @@ class DnFileFeatureExtractorCache:
class DnfileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: Path):
super().__init__()
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(path.read_bytes())
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
# pre-compute .NET token lookup tables; each .NET method has access to this cache for feature extraction
# most relevant at instruction scope
@@ -93,9 +92,6 @@ class DnfileFeatureExtractor(StaticFeatureExtractor):
def get_base_address(self):
return NO_ADDRESS
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
yield from self.global_features

View File

@@ -83,17 +83,13 @@ GLOBAL_HANDLERS = (
class DnfileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: Path):
super().__init__()
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())
def get_base_address(self) -> AbsoluteVirtualAddress:
return AbsoluteVirtualAddress(0x0)
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def get_entry_point(self) -> int:
# self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT
# True: native EP: Token

View File

@@ -167,17 +167,13 @@ GLOBAL_HANDLERS = (
class DotnetFileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: Path):
super().__init__()
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
self.path: Path = path
self.pe: dnfile.dnPE = dnfile.dnPE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())
def get_base_address(self):
return NO_ADDRESS
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def get_entry_point(self) -> int:
# self.pe.net.Flags.CLT_NATIVE_ENTRYPOINT
# True: native EP: Token

View File

@@ -30,21 +30,19 @@ from capa.features.extractors.base_extractor import (
class IdaFeatureExtractor(StaticFeatureExtractor):
def __init__(self):
super().__init__()
super().__init__(
hashes=SampleHashes(
md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256()
)
)
self.global_features: List[Tuple[Feature, Address]] = []
self.global_features.extend(capa.features.extractors.ida.file.extract_file_format())
self.global_features.extend(capa.features.extractors.ida.global_.extract_os())
self.global_features.extend(capa.features.extractors.ida.global_.extract_arch())
self.sample_hashes = SampleHashes(
md5=ida_nalt.retrieve_input_file_md5(), sha1="(unknown)", sha256=ida_nalt.retrieve_input_file_sha256()
)
def get_base_address(self):
return AbsoluteVirtualAddress(idaapi.get_imagebase())
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
yield from self.global_features

View File

@@ -63,9 +63,6 @@ class NullStaticFeatureExtractor(StaticFeatureExtractor):
for feature in self.global_features:
yield feature, NO_ADDRESS
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_file_features(self):
for address, feature in self.file_features:
yield feature, address
@@ -124,9 +121,6 @@ class NullDynamicFeatureExtractor(DynamicFeatureExtractor):
for feature in self.global_features:
yield feature, NO_ADDRESS
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_file_features(self):
for address, feature in self.file_features:
yield feature, address

View File

@@ -187,17 +187,13 @@ GLOBAL_HANDLERS = (
class PefileFeatureExtractor(StaticFeatureExtractor):
def __init__(self, path: Path):
super().__init__()
super().__init__(hashes=SampleHashes.from_bytes(path.read_bytes()))
self.path: Path = path
self.pe = pefile.PE(str(path))
self.sample_hashes = SampleHashes.from_bytes(self.path.read_bytes())
def get_base_address(self):
return AbsoluteVirtualAddress(self.pe.OPTIONAL_HEADER.ImageBase)
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
buf = Path(self.path).read_bytes()

View File

@@ -33,11 +33,10 @@ logger = logging.getLogger(__name__)
class VivisectFeatureExtractor(StaticFeatureExtractor):
def __init__(self, vw, path: Path, os):
super().__init__()
self.vw = vw
self.path = path
self.buf = path.read_bytes()
self.sample_hashes = SampleHashes.from_bytes(self.buf)
super().__init__(hashes=SampleHashes.from_bytes(self.buf))
# pre-compute these because we'll yield them at *every* scope.
self.global_features: List[Tuple[Feature, Address]] = []
@@ -49,9 +48,6 @@ class VivisectFeatureExtractor(StaticFeatureExtractor):
# assume there is only one file loaded into the vw
return AbsoluteVirtualAddress(list(self.vw.filemeta.values())[0]["imagebase"])
def get_sample_hashes(self) -> SampleHashes:
return self.sample_hashes
def extract_global_features(self):
yield from self.global_features