render: add initial proto generator

This commit is contained in:
Willi Ballenthin
2023-02-14 10:02:12 +01:00
parent 514b4929b3
commit 38d8b7f501
4 changed files with 1011 additions and 0 deletions

View File

@@ -0,0 +1,445 @@
# Copyright (C) 2023 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import io
import sys
from typing import Dict, Union
from dataclasses import dataclass
import pydantic
import capa.render
import capa.render.utils
import capa.features.freeze
import capa.render.result_document
import capa.features.freeze.features
from capa.render.utils import StringIO
def emit_proto_enum(out: StringIO, enum):
    """
    write a proto `enum` block for the given schema enum definition.

    args:
      out: writer exposing `writeln(str)`.
      enum: schema definition with a "title" (like: AddressType)
        and an "enum" list of string values.

    output looks like:

        enum AddressType {
          ADDRESSTYPE_UNSPECIFIED = 0;
          ADDRESSTYPE_ABSOLUTE = 1;
          ADDRESSTYPE_RELATIVE = 2;
          ...
        }
    """
    title = enum["title"]
    # like: ADDRESSTYPE
    prefix = title.upper()

    def member_name(raw):
        # like: "dn token" -> ADDRESSTYPE_DN_TOKEN
        return "%s_%s" % (prefix, raw.upper().replace(" ", "_"))

    out.writeln(f"enum {title} {{")
    # value zero is always emitted as the synthetic "unspecified" member,
    # so the schema's own values start at one.
    out.writeln(f' {member_name("unspecified")} = 0;')
    for number, raw in enumerate(enum["enum"], start=1):
        out.writeln(f" {member_name(raw)} = {number};")
    out.writeln("}")
    out.writeln("")
def is_ref(prop):
    """return True when the property is a reference, i.e. carries a "$ref" key."""
    has_ref = "$ref" in prop
    return has_ref
def get_ref_type_name(prop):
    """
    extract the referenced definition name from a "$ref" property.

    from: {"$ref": "#/definitions/Scope"}
    to: "Scope"

    only references into "#/definitions/" are accepted (enforced via assert).
    """
    assert is_ref(prop)

    prefix = "#/definitions/"
    target = prop["$ref"]
    assert target.startswith(prefix)

    return target[len(prefix) :]
def is_primitive_type(prop):
    """
    return True for anything with a "type" other than "object".

    things like: string, integer, bool, etc.
    note that arrays also land here, since their "type" is "array".
    """
    if "type" not in prop:
        return False
    return prop["type"] != "object"
def is_custom_type(prop):
    """
    return True for struct-like things defined in the schema, like Features, etc.

    that is: an "object" that does not declare "additionalProperties"
    (objects with "additionalProperties" are treated as maps instead).
    """
    if "type" not in prop:
        return False
    if prop["type"] != "object":
        return False
    return "additionalProperties" not in prop
def get_custom_type_name(prop):
    """return the name of a custom (struct-like) type, which is its schema "title"."""
    title = prop["title"]
    return title
def is_tuple(prop):
    """
    return True when the property describes a tuple.

    a tuple is an array with a fixed size, i.e. "minItems" and "maxItems"
    are both present and equal. the types of the elements can vary.
    we'll emit a custom message type for each tuple, like Pair_Address_Match.

    like:

        {"items": [{"$ref": "#/definitions/Address"},
                   {"$ref": "#/definitions/Match"}],
         "maxItems": 2,
         "minItems": 2,
         "type": "array"},
    """
    if prop.get("type") != "array":
        return False

    bounded = "maxItems" in prop and "minItems" in prop
    if not bounded:
        return False

    # tuples have a fixed size
    return prop["maxItems"] == prop["minItems"]
def get_tuple_type_name(prop):
    """
    build the name of the message type that represents a tuple property.

    two-element tuples are named "Pair_<A>_<B>",
    anything else is named "Tuple_<A>_<B>_...".
    """
    assert is_tuple(prop)

    base = "Pair" if prop["maxItems"] == 2 else "Tuple"

    # this won't work for nested tuples, but good enough for here.
    element_names = [get_type_name(item) for item in prop["items"]]
    suffix = "_".join(element_names)
    return f"{base}_{suffix}"
def is_array(prop):
    """
    return True when the property describes a variable-length, homogeneous array.

    an array is a sequence of elements of the same type.
    typically we can use a repeated field for this.
    note: there's a special case within maps, where the array elements are a custom wrapper type.

    like:

        {"items": {"type": "string"},
         "title": "Parts",
         "type": "array"},

    an array schema that has no "items" key at all is reported as
    not-an-array (False) rather than raising KeyError,
    so that this predicate is safe to call on any property.
    """
    if prop.get("type") != "array":
        return False

    if "maxItems" in prop and "minItems" in prop and prop["maxItems"] == prop["minItems"]:
        # tuples have a fixed size, arrays are variable
        return False

    # array elements have a fixed type, described by a single schema (a dict).
    # a list of schemas is positional, and a missing "items" gives us no element type.
    return isinstance(prop.get("items"), dict)
def is_map(prop):
    """
    return True when the property describes a map.

    a map maps from string key to a fixed type:
    an "object" that declares "additionalProperties".
    the value type cannot be repeated, so we'll emit a custom wrapper type.

    like:

        {"additionalProperties": {"items": {"$ref": "#/definitions/Address"},
                                  "type": "array"},
         "title": "Captures",
         "type": "object"},
    """
    if "type" not in prop:
        return False
    if prop["type"] != "object":
        return False
    return "additionalProperties" in prop
def get_primitive_type_name(prop):
    """
    translate a primitive (non-object) schema property into a proto type name.

    scalars map onto proto scalars or our wrapper messages,
    tuples map onto their generated tuple message name,
    and arrays map onto "repeated <element type>".

    raises NotImplementedError for any type that isn't handled.
    """
    assert is_primitive_type(prop)

    schema_type = prop["type"]

    scalar_names = (
        ("string", "string"),
        ("boolean", "bool"),
        # this integer has arbitrary range.
        # but proto supports only i64 and u64.
        # so we hook this specially, including within the translator.
        ("integer", "Integer"),
        # number: int | float
        # we hook this specially
        ("number", "Number"),
    )
    for json_name, proto_name in scalar_names:
        if schema_type == json_name:
            return proto_name

    if is_tuple(prop):
        return get_tuple_type_name(prop)

    if not is_array(prop):
        raise NotImplementedError(schema_type)

    element = prop["items"]
    if is_primitive_type(element):
        element_type = get_primitive_type_name(element)
    elif is_ref(element):
        element_type = get_ref_type_name(element)
    elif is_custom_type(element):
        element_type = get_custom_type_name(element)
    else:
        raise NotImplementedError(element)

    return f"repeated {element_type}"
def get_type_name(prop):
    """
    resolve the proto type name for a property that is either
    a primitive, a custom (struct-like) type, or a reference.

    raises NotImplementedError for anything else (such as unions and maps).
    """
    if is_primitive_type(prop):
        return get_primitive_type_name(prop)

    if is_custom_type(prop):
        return get_custom_type_name(prop)

    if is_ref(prop):
        return get_ref_type_name(prop)

    raise NotImplementedError(prop)
def is_union(prop):
    """
    return True when the property is a union:
    a field that can be one of several types, encoded via "anyOf".
    """
    has_alternatives = "anyOf" in prop
    return has_alternatives
def sanitize_prop_name(name):
    """
    rewrite a schema property name so that it only uses characters
    that this translator emits for proto field names.

    like: "analysis-conclusion" -> "analysis_conclusion"
    like: "att&ck" -> "attack"
    like: "capa/subscope" -> "capa_subscope"
    like: "function name" -> "function_name"
    """
    # all substitutions are single-character, and none of the replacements
    # is itself a substituted character, so one pass is equivalent to chained replaces.
    substitutions = str.maketrans(
        {
            "-": "_",
            "&": "a",
            "/": "_",
            " ": "_",
        }
    )
    return name.translate(substitutions)
def _find_capa_class(name):
    """
    try to find the capa class that corresponds to the given name.
    we use this to find the class that defines the property order.

    the modules are searched in a fixed order and the first hit wins.
    raises NotImplementedError when none of them defines the name.
    """
    search_order = (
        capa.render.result_document,
        capa.features.freeze,
        capa.features.freeze.features,
    )

    # unique sentinel, so that an attribute whose value is None still counts as found.
    missing = object()
    for module in search_order:
        found = getattr(module, name, missing)
        if found is not missing:
            return found

    raise NotImplementedError(name)
def _enum_properties(message):
    """
    enumerate the properties of the message definition, ordered by class declaration.

    returns a list of (raw property name, property schema) pairs.
    """
    # this is just for convenience.
    # the order of properties provided by the class. guaranteed.
    capa_class = _find_capa_class(message["title"])
    declared_order = list(capa_class.__signature__.parameters.keys())

    # order of properties provided by pydantic. not guaranteed. the fallback.
    # used when we can't figure out an alias, such as capa/subscope -> is_subscope.
    schema_properties = message["properties"]
    schema_order = list(schema_properties.keys())
    fallback_base = len(schema_properties)

    def sort_key(entry):
        raw_name = entry[0]
        sanitized = sanitize_prop_name(raw_name)
        if sanitized in declared_order:
            # prefer the order of properties provided by the class.
            return declared_order.index(sanitized)
        # fallback to whatever pydantic extracts;
        # offset so these always sort after the class-declared properties.
        return fallback_base + schema_order.index(raw_name)

    return sorted(schema_properties.items(), key=sort_key)
@dataclass
class DeferredArrayType:
    """
    a wrapper message around a repeated value, like Array_Address,
    recorded while emitting messages and written out afterwards.
    """

    # name of the wrapper message to emit, like: Array_Address
    name: str
    # schema of a single array element
    item: dict
@dataclass
class DeferredTupleType:
    """
    a message type standing in for a fixed-size tuple, like Pair_Address_Match,
    recorded while emitting messages and written out afterwards.
    """

    # name of the tuple message to emit, like: Pair_Address_Match
    name: str
    # number of elements; populated from the schema's "minItems".
    # NOTE(review): not read anywhere in the code visible here.
    count: int
    # the element schemas, in positional order.
    # this is the tuple schema's "items" value, which is a list (one schema per element),
    # not a dict; the emitter enumerates it to number the fields.
    items: list
def emit_proto_message(out: StringIO, deferred_types: Dict, message):
    """
    write a proto `message` block for the given schema object definition.

    args:
      out: writer exposing `writeln(str)`.
      deferred_types: registry, keyed by type name, of the helper types
        (tuple messages and array wrappers) that this message needs.
        it is updated in place; the caller emits these afterwards.
      message: schema definition with a "title" (like: Address) and "properties".

    raises:
      NotImplementedError: when a union alternative is neither a reference nor a primitive.
      ValueError: when a property doesn't match any of the handled shapes.
    """
    title = message["title"]
    out.writeln(f"message {title} {{")

    # we use a counter like this so that
    # union/oneof fields can increment the counter.
    field_numbers = iter(range(1, sys.maxsize))

    for raw_name, prop in _enum_properties(message):
        number = next(field_numbers)
        name = sanitize_prop_name(raw_name)

        if is_ref(prop):
            ref_type = get_ref_type_name(prop)
            out.writeln(f" {ref_type} {name} = {number};")
            continue

        if is_primitive_type(prop):
            prim_type = get_primitive_type_name(prop)
            out.writeln(f" {prim_type} {name} = {number};")

            if is_tuple(prop):
                # the field refers to a tuple message that still has to be emitted.
                deferred_types[prim_type] = DeferredTupleType(prim_type, prop["minItems"], prop["items"])
            elif is_array(prop):
                element = prop["items"]
                if is_tuple(element):
                    # a repeated tuple: the element's message still has to be emitted.
                    element_type = get_tuple_type_name(element)
                    deferred_types[element_type] = DeferredTupleType(
                        element_type, element["minItems"], element["items"]
                    )
            continue

        if is_custom_type(prop):
            custom_type = get_custom_type_name(prop)
            out.writeln(f" {custom_type} {name} = {number};")
            continue

        if is_union(prop):
            out.writeln(f" oneof {name} {{")

            for j, alternative in enumerate(prop["anyOf"]):
                if is_ref(alternative):
                    alt_type = get_ref_type_name(alternative)
                    out.writeln(f" {alt_type} v{j} = {number};")
                elif is_primitive_type(alternative):
                    alt_type = get_primitive_type_name(alternative)
                    out.writeln(f" {alt_type} v{j} = {number};")
                    if is_tuple(alternative):
                        deferred_types[alt_type] = DeferredTupleType(
                            alt_type, alternative["minItems"], alternative["items"]
                        )
                else:
                    # pydantic doesn't seem to encode None option
                    # fortunately, neither does protobuf.
                    # still seems weird not to be explicit.
                    raise NotImplementedError(alternative)

                # note: this advances after *every* alternative, including the last one,
                # so the field that follows a oneof skips one number.
                number = next(field_numbers)

            out.writeln(" };")
            continue

        if is_map(prop):
            value_schema = prop["additionalProperties"]

            if is_array(value_schema):
                # map values cannot be repeated, see:
                # https://stackoverflow.com/a/41552990/87207
                #
                # so create a wrapper type around the repeated values.
                # like: message Array_Integer { repeated int32 values = 1; }
                #
                # no:
                #
                #     map <string, repeated int32> things = 1;
                #
                # yes:
                #
                #     map <string, Array_Integer> things = 1;
                #
                # we could do this for every array, like Array_Integer and Array_Address,
                # but it's less idiomatic and more noisy.
                # so we only create these types when we need them.
                item_def = value_schema["items"]
                value_type = "Array_" + get_type_name(item_def)

                # register this type to be emitted once we're done with the
                # top level custom types in the schema.
                deferred_types[value_type] = DeferredArrayType(value_type, item_def)
            else:
                value_type = get_type_name(value_schema)

            out.writeln(f" map <string, {value_type}> {name} = {number};")
            continue

        raise ValueError("unexpected type: %s" % prop)

    out.writeln("}")
    out.writeln("")
def emit_proto_entry(out: StringIO, deferred_types: Dict, schema, name):
    """
    emit the proto definition for a single schema entry.

    args:
      out: writer exposing `writeln(str)`.
      deferred_types: registry of helper types, passed through to the message emitter.
      schema: the full schema, with a "definitions" mapping.
      name: reference to the entry, like: "#/definitions/Address"

    raises:
      ValueError: when the name isn't a "#/definitions/" reference,
        or the definition's title doesn't match the name.
      NotImplementedError: when the definition is neither a string enum nor an object.
    """
    prefix = "#/definitions/"
    if not name.startswith(prefix):
        raise ValueError("unexpected name: %s" % name)

    title = name[len(prefix) :]
    definition = schema["definitions"][title]

    if definition["title"] != title:
        raise ValueError("title mismatch: %s" % definition["title"])

    kind = definition["type"]
    if kind == "object":
        emit_proto_message(out, deferred_types, definition)
    elif kind == "string" and "enum" in definition:
        emit_proto_enum(out, definition)
    else:
        raise NotImplementedError(kind)
def generate_proto_from_pydantic(schema):
    """
    render a proto3 document from a schema.

    the output contains, in order:
      1. the file header,
      2. one enum/message per entry in schema["definitions"], sorted by name,
      3. the helper types registered while emitting those (array wrappers and tuples), sorted by name,
      4. the Integer and Number wrapper messages.

    returns the rendered proto as a string.
    """
    out: StringIO = capa.render.utils.StringIO()
    out.writeln("// Generated by the capa.render.proto translator. DO NOT EDIT!")
    out.writeln('syntax = "proto3";')
    out.writeln("")

    # filled in by the emitters as they encounter tuples and map-of-array values.
    deferred_types: Dict[str, Union[DeferredArrayType, DeferredTupleType]] = {}

    for definition_name in sorted(schema["definitions"].keys()):
        emit_proto_entry(out, deferred_types, schema, "#/definitions/" + definition_name)

    for type_name in sorted(deferred_types):
        deferred = deferred_types[type_name]

        if isinstance(deferred, DeferredArrayType):
            element_type = get_type_name(deferred.item)
            out.writeln(f"message {type_name} {{ repeated {element_type} values = 1; }}\n")

        elif isinstance(deferred, DeferredTupleType):
            out.writeln(f"message {type_name} {{")
            # tuple elements become positional fields: v0 = 1, v1 = 2, ...
            for position, element in enumerate(deferred.items):
                element_type = get_type_name(element)
                out.writeln(f" {element_type} v{position} = {position + 1};")
            out.writeln("}\n")

    # these are additional primitive types that we'll use throughout.
    out.writeln("message Integer { oneof value { uint64 u = 1; int64 i = 2; } }\n")
    out.writeln("message Number { oneof value { uint64 u = 1; int64 i = 2; double f = 3; } }\n")

    return out.getvalue()
def generate_proto() -> str:
    """
    generate a protobuf v3 schema for the ResultDocument format.
    we use introspection of the pydantic schema to generate this.

    note: we *cannot* use the generated proto from version to version of capa,
    because this translator does not guarantee field ordering/numbering.
    that is, if we add a new property to any of the pydantic models,
    the proto field numbers may change, and any clients using the proto will break.

    instead, we should use this method to generate the proto,
    probably once per major version,
    and then commit the proto to the repo.
    """
    schema = pydantic.schema_of(capa.render.result_document.ResultDocument)
    return generate_proto_from_pydantic(schema)

View File

@@ -0,0 +1,392 @@
// Generated by the capa.render.proto translator. DO NOT EDIT!
syntax = "proto3";
message APIFeature {
string type = 1;
string api = 2;
string description = 3;
}
message Address {
AddressType type = 1;
oneof value {
Integer v0 = 2;
Pair_Integer_Integer v1 = 3;
};
}
enum AddressType {
ADDRESSTYPE_UNSPECIFIED = 0;
ADDRESSTYPE_ABSOLUTE = 1;
ADDRESSTYPE_RELATIVE = 2;
ADDRESSTYPE_FILE = 3;
ADDRESSTYPE_DN_TOKEN = 4;
ADDRESSTYPE_DN_TOKEN_OFFSET = 5;
ADDRESSTYPE_NO_ADDRESS = 6;
}
message Analysis {
string format = 1;
string arch = 2;
string os = 3;
string extractor = 4;
repeated string rules = 5;
Address base_address = 6;
Layout layout = 7;
FeatureCounts feature_counts = 8;
repeated LibraryFunction library_functions = 9;
}
message ArchFeature {
string type = 1;
string arch = 2;
string description = 3;
}
message AttackSpec {
repeated string parts = 1;
string tactic = 2;
string technique = 3;
string subtechnique = 4;
string id = 5;
}
message BasicBlockFeature {
string type = 1;
string description = 2;
}
message BasicBlockLayout {
Address address = 1;
}
message BytesFeature {
string type = 1;
string bytes = 2;
string description = 3;
}
message CharacteristicFeature {
string type = 1;
string characteristic = 2;
string description = 3;
}
message ClassFeature {
string type = 1;
string description = 2;
string class = 3;
}
message CompoundStatement {
string type = 1;
string description = 2;
}
message ExportFeature {
string type = 1;
string export = 2;
string description = 3;
}
message FeatureCounts {
Integer file = 1;
repeated FunctionFeatureCount functions = 2;
}
message FeatureNode {
oneof feature {
OSFeature v0 = 1;
ArchFeature v1 = 2;
FormatFeature v2 = 3;
MatchFeature v3 = 4;
CharacteristicFeature v4 = 5;
ExportFeature v5 = 6;
ImportFeature v6 = 7;
SectionFeature v7 = 8;
FunctionNameFeature v8 = 9;
SubstringFeature v9 = 10;
RegexFeature v10 = 11;
StringFeature v11 = 12;
ClassFeature v12 = 13;
NamespaceFeature v13 = 14;
APIFeature v14 = 15;
PropertyFeature v15 = 16;
NumberFeature v16 = 17;
BytesFeature v17 = 18;
OffsetFeature v18 = 19;
MnemonicFeature v19 = 20;
OperandNumberFeature v20 = 21;
OperandOffsetFeature v21 = 22;
BasicBlockFeature v22 = 23;
};
string type = 25;
}
message FormatFeature {
string type = 1;
string format = 2;
string description = 3;
}
message FunctionFeatureCount {
Address address = 1;
Integer count = 2;
}
message FunctionLayout {
Address address = 1;
repeated BasicBlockLayout matched_basic_blocks = 2;
}
message FunctionNameFeature {
string type = 1;
string function_name = 2;
string description = 3;
}
message ImportFeature {
string type = 1;
string description = 2;
string import = 3;
}
message Layout {
repeated FunctionLayout functions = 1;
}
message LibraryFunction {
Address address = 1;
string name = 2;
}
message MBCSpec {
repeated string parts = 1;
string objective = 2;
string behavior = 3;
string method = 4;
string id = 5;
}
message MaecMetadata {
string analysis_conclusion = 1;
string analysis_conclusion_ov = 2;
string malware_family = 3;
string malware_category = 4;
string malware_category_ov = 5;
}
message Match {
bool success = 1;
oneof node {
StatementNode v0 = 2;
FeatureNode v1 = 3;
};
repeated Match children = 5;
repeated Address locations = 6;
map <string, Array_Address> captures = 7;
}
message MatchFeature {
string type = 1;
string match = 2;
string description = 3;
}
message Metadata {
string timestamp = 1;
string version = 2;
repeated string argv = 3;
Sample sample = 4;
Analysis analysis = 5;
}
message MnemonicFeature {
string type = 1;
string mnemonic = 2;
string description = 3;
}
message NamespaceFeature {
string type = 1;
string namespace = 2;
string description = 3;
}
message NumberFeature {
string type = 1;
oneof number {
Integer v0 = 2;
Number v1 = 3;
};
string description = 5;
}
message OSFeature {
string type = 1;
string os = 2;
string description = 3;
}
message OffsetFeature {
string type = 1;
Integer offset = 2;
string description = 3;
}
message OperandNumberFeature {
string type = 1;
Integer index = 2;
Integer operand_number = 3;
string description = 4;
}
message OperandOffsetFeature {
string type = 1;
Integer index = 2;
Integer operand_offset = 3;
string description = 4;
}
message PropertyFeature {
string type = 1;
string access = 2;
string property = 3;
string description = 4;
}
message RangeStatement {
string description = 1;
Integer min = 2;
Integer max = 3;
oneof child {
OSFeature v0 = 4;
ArchFeature v1 = 5;
FormatFeature v2 = 6;
MatchFeature v3 = 7;
CharacteristicFeature v4 = 8;
ExportFeature v5 = 9;
ImportFeature v6 = 10;
SectionFeature v7 = 11;
FunctionNameFeature v8 = 12;
SubstringFeature v9 = 13;
RegexFeature v10 = 14;
StringFeature v11 = 15;
ClassFeature v12 = 16;
NamespaceFeature v13 = 17;
APIFeature v14 = 18;
PropertyFeature v15 = 19;
NumberFeature v16 = 20;
BytesFeature v17 = 21;
OffsetFeature v18 = 22;
MnemonicFeature v19 = 23;
OperandNumberFeature v20 = 24;
OperandOffsetFeature v21 = 25;
BasicBlockFeature v22 = 26;
};
string type = 28;
}
message RegexFeature {
string type = 1;
string regex = 2;
string description = 3;
}
message ResultDocument {
Metadata meta = 1;
map <string, RuleMatches> rules = 2;
}
message RuleMatches {
RuleMetadata meta = 1;
string source = 2;
repeated Pair_Address_Match matches = 3;
}
message RuleMetadata {
string name = 1;
string namespace = 2;
repeated string authors = 3;
Scope scope = 4;
repeated AttackSpec attack = 5;
repeated MBCSpec mbc = 6;
repeated string references = 7;
repeated string examples = 8;
string description = 9;
bool lib = 10;
MaecMetadata maec = 11;
bool capa_subscope = 12;
}
message Sample {
string md5 = 1;
string sha1 = 2;
string sha256 = 3;
string path = 4;
}
enum Scope {
SCOPE_UNSPECIFIED = 0;
SCOPE_FILE = 1;
SCOPE_FUNCTION = 2;
SCOPE_BASIC_BLOCK = 3;
SCOPE_INSTRUCTION = 4;
}
message SectionFeature {
string type = 1;
string section = 2;
string description = 3;
}
message SomeStatement {
string description = 1;
Integer count = 2;
string type = 3;
}
message StatementNode {
oneof statement {
RangeStatement v0 = 1;
SomeStatement v1 = 2;
SubscopeStatement v2 = 3;
CompoundStatement v3 = 4;
};
string type = 6;
}
message StringFeature {
string type = 1;
string string = 2;
string description = 3;
}
message SubscopeStatement {
string description = 1;
Scope scope = 2;
string type = 3;
}
message SubstringFeature {
string type = 1;
string substring = 2;
string description = 3;
}
message Array_Address { repeated Address values = 1; }
message Pair_Address_Match {
Address v0 = 1;
Match v1 = 2;
}
message Pair_Integer_Integer {
Integer v0 = 1;
Integer v1 = 2;
}
message Integer { oneof value { uint64 u = 1; int64 i = 2; } }
message Number { oneof value { uint64 u = 1; int64 i = 2; double f = 3; } }

File diff suppressed because one or more lines are too long

37
tests/test_proto.py Normal file
View File

@@ -0,0 +1,37 @@
# Copyright (C) 2023 FireEye, Inc. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at: [package root]/LICENSE.txt
# Unless required by applicable law or agreed to in writing, software distributed under the License
# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and limitations under the License.
import pathlib
import subprocess
import capa.render
import capa.render.proto
import capa.render.utils
import capa.features.freeze
import capa.render.result_document
import capa.features.freeze.features
def test_generate_proto(tmp_path: pathlib.Path):
    """
    generate the proto, write it to a temporary directory,
    and check that `protoc` can compile it to a python module.

    the proto and the compiled module are printed to ease debugging on failure.
    requires `protoc` to be available on the PATH.
    """
    tmp_path.mkdir(exist_ok=True, parents=True)
    proto_path = tmp_path / "capa.proto"

    proto = capa.render.proto.generate_proto()

    banner = "====================================="
    print(banner)
    print(proto_path)
    print("-------------------------------------")
    for i, line in enumerate(proto.split("\n")):
        print(f" {i} | {line}")
    print(banner)

    proto_path.write_text(proto)

    work_dir = str(tmp_path)
    command = ["protoc", "-I=" + work_dir, "--python_out=" + work_dir, str(proto_path)]
    # check=True: a non-zero exit from protoc fails the test.
    subprocess.run(command, check=True)

    # reading the generated module also fails the test if protoc didn't produce it.
    pb = tmp_path / "capa_pb2.py"
    print(pb.read_text())
    print(banner)