Merge pull request #421 from Ana06/viv-py3

2025-12-12 15:49:46 -08:00 · 2021-03-03 21:40:08 +01:00
parent 1ecaad5413 c522f5094a
commit 829274cd5e
15 changed files with 158 additions and 38 deletions
--- a/capa/features/extractors/viv/init.py
+++ b/capa/features/extractors/viv/init.py
@@ -8,11 +8,7 @@

 import types

-import file
-import insn
-import function
 import viv_utils
-import basicblock

 import capa.features.extractors
 import capa.features.extractors.viv.file
@@ -42,7 +38,7 @@ def add_va_int_cast(o):
    this bit of skullduggery lets use cast viv-utils objects as ints.
    the correct way of doing this is to update viv-utils (or subclass the objects here).
    """
-    setattr(o, "__int__", types.MethodType(get_va, o, type(o)))
+    setattr(o, "__int__", types.MethodType(get_va, o))
    return o


--- a/capa/features/extractors/viv/basicblock.py
+++ b/capa/features/extractors/viv/basicblock.py
@@ -125,11 +125,16 @@ def get_printable_len(oper):


 def is_printable_ascii(chars):
-    return all(ord(c) < 127 and c in string.printable for c in chars)
+    try:
+        chars_str = chars.decode("ascii")  # raises UnicodeDecodeError on any byte >= 0x80
+    except UnicodeDecodeError:
+        return False
+    else:
+        return all(c in string.printable for c in chars_str)


 def is_printable_utf16le(chars):
-    if all(c == "\x00" for c in chars[1::2]):
+    if not chars[1::2].strip(b"\x00"):  # odd bytes all NUL; py3 bytes iterate as ints, so compare the slice
        return is_printable_ascii(chars[::2])


--- a/capa/features/extractors/viv/insn.py
+++ b/capa/features/extractors/viv/insn.py
@@ -239,7 +239,7 @@ def read_bytes(vw, va):
    """
    segm = vw.getSegment(va)
    if not segm:
-        raise envi.SegmentationViolation()
+        raise envi.SegmentationViolation(va)

    segm_end = segm[0] + segm[1]
    try:
--- a/capa/features/freeze.py
+++ b/capa/features/freeze.py
@@ -264,6 +264,15 @@ def main(argv=None):
    parser.add_argument(
        "-f", "--format", choices=[f[0] for f in formats], default="auto", help="Select sample format, %s" % format_help
    )
+    if sys.version_info >= (3, 0):
+        parser.add_argument(
+            "-b",
+            "--backend",
+            type=str,
+            help="select the backend to use",
+            choices=(capa.main.BACKEND_VIV, capa.main.BACKEND_SMDA),
+            default=capa.main.BACKEND_VIV,
+        )
    args = parser.parse_args(args=argv)

    if args.quiet:
@@ -276,7 +285,8 @@ def main(argv=None):
        logging.basicConfig(level=logging.INFO)
        logging.getLogger().setLevel(logging.INFO)

-    extractor = capa.main.get_extractor(args.sample, args.format)
+    backend = args.backend if sys.version_info > (3, 0) else capa.main.BACKEND_VIV
+    extractor = capa.main.get_extractor(args.sample, args.format, backend)
    with open(args.output, "wb") as f:
        f.write(dump(extractor))

--- a/capa/main.py
+++ b/capa/main.py
@@ -32,7 +32,9 @@ import capa.features.extractors
 from capa.helpers import oint, get_file_taste

 RULES_PATH_DEFAULT_STRING = "(embedded rules)"
-SUPPORTED_FILE_MAGIC = set(["MZ"])
+SUPPORTED_FILE_MAGIC = set([b"MZ"])
+BACKEND_VIV = "vivisect"
+BACKEND_SMDA = "smda"


 logger = logging.getLogger("capa")
@@ -303,29 +305,43 @@ class UnsupportedRuntimeError(RuntimeError):
    pass


-def get_extractor_py3(path, format, disable_progress=False):
-    from smda.SmdaConfig import SmdaConfig
-    from smda.Disassembler import Disassembler
+def get_extractor_py3(path, format, backend, disable_progress=False):
+    if backend == BACKEND_SMDA:  # any other backend value falls through to vivisect
+        from smda.SmdaConfig import SmdaConfig
+        from smda.Disassembler import Disassembler

-    import capa.features.extractors.smda
+        import capa.features.extractors.smda

-    smda_report = None
-    with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
-        config = SmdaConfig()
-        config.STORE_BUFFER = True
-        smda_disasm = Disassembler(config)
-        smda_report = smda_disasm.disassembleFile(path)
+        smda_report = None
+        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+            config = SmdaConfig()
+            config.STORE_BUFFER = True
+            smda_disasm = Disassembler(config)
+            smda_report = smda_disasm.disassembleFile(path)

-    return capa.features.extractors.smda.SmdaFeatureExtractor(smda_report, path)
+        return capa.features.extractors.smda.SmdaFeatureExtractor(smda_report, path)
+    else:
+        import capa.features.extractors.viv
+
+        with halo.Halo(text="analyzing program", spinner="simpleDots", stream=sys.stderr, enabled=not disable_progress):
+            vw = get_workspace(path, format, should_save=False)
+
+            try:
+                vw.saveWorkspace()
+            except IOError:
+                # see #168 for discussion around how to handle non-writable directories
+                logger.info("source directory is not writable, won't save intermediate workspace")
+
+        return capa.features.extractors.viv.VivisectFeatureExtractor(vw, path)


-def get_extractor(path, format, disable_progress=False):
+def get_extractor(path, format, backend=BACKEND_VIV, disable_progress=False):
    """
    raises:
      UnsupportedFormatError:
    """
    if sys.version_info >= (3, 0):
-        return get_extractor_py3(path, format, disable_progress=disable_progress)
+        return get_extractor_py3(path, format, backend, disable_progress=disable_progress)  # only the py3 path takes a backend
    else:
        return get_extractor_py2(path, format, disable_progress=disable_progress)

@@ -501,6 +517,15 @@ def main(argv=None):
    parser.add_argument(
        "-f", "--format", choices=[f[0] for f in formats], default="auto", help="select sample format, %s" % format_help
    )
+    if sys.version_info >= (3, 0):
+        parser.add_argument(
+            "-b",
+            "--backend",
+            type=str,
+            help="select the backend to use",
+            choices=(BACKEND_VIV, BACKEND_SMDA),
+            default=BACKEND_VIV,
+        )
    parser.add_argument("-t", "--tag", type=str, help="filter on rule meta field values")
    parser.add_argument("-j", "--json", action="store_true", help="emit JSON instead of text")
    parser.add_argument(
@@ -605,7 +630,8 @@ def main(argv=None):
    else:
        format = args.format
        try:
-            extractor = get_extractor(args.sample, args.format, disable_progress=args.quiet)
+            backend = args.backend if sys.version_info > (3, 0) else capa.main.BACKEND_VIV
+            extractor = get_extractor(args.sample, args.format, backend, disable_progress=args.quiet)
        except UnsupportedFormatError:
            logger.error("-" * 80)
            logger.error(" Input file does not appear to be a PE file.")
--- a/scripts/bulk-process.py
+++ b/scripts/bulk-process.py
@@ -95,7 +95,7 @@ def get_capa_results(args):
    rules, format, path = args
    logger.info("computing capa results for: %s", path)
    try:
-        extractor = capa.main.get_extractor(path, format, disable_progress=True)
+        extractor = capa.main.get_extractor(path, format, capa.main.BACKEND_VIV, disable_progress=True)
    except capa.main.UnsupportedFormatError:
        # i'm 100% sure if multiprocessing will reliably raise exceptions across process boundaries.
        # so instead, return an object with explicit success/failure status.
--- a/scripts/capa_as_library.py
+++ b/scripts/capa_as_library.py
@@ -191,7 +191,7 @@ def render_dictionary(doc):
 def capa_details(file_path, output_format="dictionary"):

    # extract features and find capabilities
-    extractor = capa.main.get_extractor(file_path, "auto", disable_progress=True)
+    extractor = capa.main.get_extractor(file_path, "auto", capa.main.BACKEND_VIV, disable_progress=True)
    capabilities, counts = capa.main.find_capabilities(rules, extractor, disable_progress=True)

    # collect metadata (used only to make rendering more complete)
--- a/scripts/lint.py
+++ b/scripts/lint.py
@@ -201,7 +201,7 @@ class DoesntMatchExample(Lint):
                continue

            try:
-                extractor = capa.main.get_extractor(path, "auto", disable_progress=True)
+                extractor = capa.main.get_extractor(path, "auto", capa.main.BACKEND_VIV, disable_progress=True)
                capabilities, meta = capa.main.find_capabilities(ctx["rules"], extractor, disable_progress=True)
            except Exception as e:
                logger.error("failed to extract capabilities: %s %s %s", rule.name, path, e)
--- a/scripts/show-capabilities-by-function.py
+++ b/scripts/show-capabilities-by-function.py
@@ -199,7 +199,7 @@ def main(argv=None):
        else:
            format = args.format
            try:
-                extractor = capa.main.get_extractor(args.sample, args.format)
+                extractor = capa.main.get_extractor(args.sample, args.format, capa.main.BACKEND_VIV)
            except capa.main.UnsupportedFormatError:
                logger.error("-" * 80)
                logger.error(" Input file does not appear to be a PE file.")
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -125,7 +125,7 @@ def main(argv=None):
            extractor = capa.features.freeze.load(f.read())
    else:
        try:
-            extractor = capa.main.get_extractor(args.sample, args.format)
+            extractor = capa.main.get_extractor(args.sample, args.format, capa.main.BACKEND_VIV)
        except capa.main.UnsupportedFormatError:
            logger.error("-" * 80)
            logger.error(" Input file does not appear to be a PE file.")
--- a/scripts/vivisect-py2-vs-py3.sh
+++ b/scripts/vivisect-py2-vs-py3.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+# Compare run time and extracted feature counts of show-features.py under python2 vs python3.
+int() {  # round $1 to the nearest integer; the result is left in the global $int
+  int=$(bc <<< "scale=0; ($1 + 0.5)/1")
+}
+
+export TIMEFORMAT='%3R'
+threshold_time=90 # Flag py3 as SLOWER when py2 needs less than this percentage of the py3 time
+threshold_py3_time=60 # Do not warn if it doesn't take at least 1 minute to run
+rm tests/data/*.viv 2>/dev/null
+mkdir -p results
+for file in tests/data/*
+do
+  file=$(printf %q "$file") # Handle names with white spaces
+  file_name=$(basename $file)
+  echo $file_name
+
+  rm "$file.viv" 2>/dev/null
+  py3_time=$(sh -c "time python3 scripts/show-features.py $file >> results/p3-$file_name.out 2>/dev/null" 2>&1)
+  rm "$file.viv" 2>/dev/null
+  py2_time=$(sh -c "time python2 scripts/show-features.py $file >> results/p2-$file_name.out 2>/dev/null" 2>&1)
+
+  int $py3_time
+  if (($int > $threshold_py3_time))
+  then
+    percentage=$(bc <<< "scale=3; $py2_time/$py3_time*100 + 0.5")
+    int $percentage
+    if (($int < $threshold_time))
+    then
+      echo -n "  SLOWER ($percentage): "
+    fi
+  fi
+  echo "  PY2($py2_time) PY3($py3_time)"
+done
+
+threshold_features=98
+count=0
+average=0
+results_for() {
+  py3=$(cat "results/p3-$file_name.out" | grep "$1" | wc -l)
+  py2=$(cat "results/p2-$file_name.out" | grep "$1" | wc -l)
+  if (($py2 > 0))
+  then
+    percentage=$(bc <<< "scale=2; 100*$py3/$py2")
+    average=$(bc <<< "scale=2; $percentage + $average")
+    count=$(($count + 1))
+    int $percentage
+    if (($int < $threshold_features))
+    then
+      echo -e "$1: py2($py2) py3($py3) $percentage% - $file_name"
+    fi
+  fi
+}
+
+rm tests/data/*.viv 2>/dev/null
+echo -e '\nRESULTS:'
+for file in tests/data/*
+do
+  file_name=$(basename $file)
+  if test -f "results/p2-$file_name.out"; then
+    results_for 'insn'
+    results_for 'file'
+    results_for 'func'
+    results_for 'bb'
+  fi
+done
+
+if ((count > 0)); then average=$(bc <<< "scale=2; $average/$count"); fi # avoid dividing by zero when nothing was compared
+echo "TOTAL: $average"
--- a/setup.py
+++ b/setup.py
@@ -27,6 +27,8 @@ if sys.version_info >= (3, 0):
    # py3
    requirements.append("halo")
    requirements.append("networkx")
+    requirements.append("vivisect==1.0.0")
+    requirements.append("viv-utils==0.3.19")
    requirements.append("smda==1.5.13")
 else:
    # py2
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -520,11 +520,7 @@ def do_test_feature_count(get_extractor, sample, scope, feature, expected):


 def get_extractor(path):
-    if sys.version_info >= (3, 0):
-        extractor = get_smda_extractor(path)
-    else:
-        extractor = get_viv_extractor(path)
-
+    extractor = get_viv_extractor(path)  # same vivisect extractor on py2 and py3; no version switch
    # overload the extractor so that the fixture exposes `extractor.path`
    setattr(extractor, "path", path)
    return extractor
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -7,6 +7,7 @@
 #  is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and limitations under the License.
 import sys
+import json
 import textwrap

 import pytest
@@ -365,3 +366,20 @@ def test_not_render_rules_also_matched(z9324d_extractor, capsys):
    assert "act as TCP client" in std.out
    assert "connect TCP socket" in std.out
    assert "create TCP socket" in std.out
+
+
+# Check that main() works with each supported analysis backend; reported as skipped (not passed) on py2.
+@pytest.mark.skipif(sys.version_info < (3, 0), reason="the -b/--backend option only exists on py3")
+def test_backend_option(capsys):
+    path = get_data_path_by_name("pma16-01")
+    assert capa.main.main([path, "-j", "-b", capa.main.BACKEND_VIV]) == 0
+    std = capsys.readouterr()
+    std_json = json.loads(std.out)
+    assert std_json["meta"]["analysis"]["extractor"] == "VivisectFeatureExtractor"
+    assert len(std_json["rules"]) > 0
+
+    assert capa.main.main([path, "-j", "-b", capa.main.BACKEND_SMDA]) == 0
+    std = capsys.readouterr()
+    std_json = json.loads(std.out)
+    assert std_json["meta"]["analysis"]["extractor"] == "SmdaFeatureExtractor"
+    assert len(std_json["rules"]) > 0
--- a/tests/test_viv_features.py
+++ b/tests/test_viv_features.py
@@ -16,8 +16,7 @@ from fixtures import *
    indirect=["sample", "scope"],
 )
 def test_viv_features(sample, scope, feature, expected):
-    with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"):
-        do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected)
+    do_test_feature_presence(get_viv_extractor, sample, scope, feature, expected)


@parametrize(
@@ -26,5 +25,4 @@ def test_viv_features(sample, scope, feature, expected):
    indirect=["sample", "scope"],
 )
 def test_viv_feature_counts(sample, scope, feature, expected):
-    with xfail(sys.version_info >= (3, 0), reason="vivsect only works on py2"):
-        do_test_feature_count(get_viv_extractor, sample, scope, feature, expected)
+    do_test_feature_count(get_viv_extractor, sample, scope, feature, expected)