Merge pull request #1 from colton-gabertan/ghidra_backend

Ghidra backend
2025-12-12 15:49:46 -08:00 · 2023-06-02 23:24:43 -07:00
parent 0cbe4618e1 a7639d33b9
commit de19c9300d
10 changed files with 258 additions and 3 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,6 +1,6 @@
 name: CI

-on:
+on: 
  push:
    branches: [ master ]
  pull_request:
@@ -136,3 +136,55 @@ jobs:
      env:
        BN_LICENSE: ${{ secrets.BN_LICENSE }}
      run: pytest -v tests/test_binja_features.py  # explicitly refer to the binja tests for performance. other tests run above.
+
+  ghidra-tests:
+    name: Ghidra tests for ${{ matrix.python-version }}
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.7", "3.11"]
+        java-version: ["17"]
+    steps:
+    - name: Checkout capa with submodules
+      uses: actions/checkout@ac593985615ec2ede58e132d2e21d2b1cbd6127c # v3.3.0
+      with:
+        submodules: recursive
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@d27e3f3d7c64b4bbf8e4abfb9b63b83e846e0435 # v4.5.0
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Set up Java ${{ matrix.java-version }}
+      uses: actions/setup-java@5ffc13f4174014e2d4d4572b3d74c3fa61aeb2c2 # v3
+      with:
+        distribution: 'temurin'
+        java-version: ${{ matrix.java-version }}
+    - name: Set up Gradle 7.3 # must be done manually due to no gradle build in capa
+      run: |
+        mkdir /opt/gradle
+        wget "https://services.gradle.org/distributions/gradle-7.3-bin.zip" -O /opt/gradle/gradle-7.3.zip
+        unzip /opt/gradle/gradle-7.3.zip -d /opt/gradle
+    - name: Install Ghidra 10.3
+      run: |
+        mkdir ./.github/ghidra
+        wget "https://github.com/NationalSecurityAgency/ghidra/releases/download/Ghidra_10.3_build/ghidra_10.3_PUBLIC_20230510.zip" -O ./.github/ghidra/ghidra_10.3_PUBLIC.zip
+        unzip .github/ghidra/ghidra_10.3_PUBLIC.zip -d .github/ghidra/
+    - name: Install Jep 4.1.1
+      run : |
+        mkdir ./.github/jep
+        wget "https://github.com/ninia/jep/archive/refs/tags/v4.1.1.zip" -O ./.github/jep/jep-4.1.1.zip
+        unzip .github/jep/jep-4.1.1.zip -d .github/jep/
+        pip install .github/jep/jep-4.1.1/
+    - name: Install Ghidrathon
+      run : |
+        mkdir ./.github/ghidrathon
+        wget "https://github.com/mandiant/Ghidrathon/archive/refs/tags/v2.1.0.zip" -O ./.github/ghidrathon/ghidrathon-2.1.0.zip
+        unzip .github/ghidrathon/ghidrathon-2.1.0.zip -d .github/ghidrathon/
+        workdir=$(pwd)
+        /opt/gradle/gradle-7.3/bin/gradle -p ./.github/ghidrathon/Ghidrathon-2.1.0/ -PGHIDRA_INSTALL_DIR=$workdir/.github/ghidra/ghidra_10.3_PUBLIC 
+        unzip .github/ghidrathon/Ghidrathon-2.1.0/dist/*.zip -d $workdir/.github/ghidra/ghidra_10.3_PUBLIC/Extensions
+    - name: Install pyyaml
+      run: sudo apt-get install -y libyaml-dev
+    - name: Install capa
+      run: pip install -e .[dev]
+ 
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,7 @@

 ### New Features
 - Utility script to detect feature overlap between new and existing CAPA rules [#1451](https://github.com/mandiant/capa/issues/1451) [@Aayush-Goel-04](https://github.com/aayush-goel-04)
+- extractor: Implement Ghidra Backend [@colton-gabertan](https://github.com/colton-gabertan)

 ### Breaking Changes

@@ -16,7 +17,7 @@
 - communication/mailslot/read-from-mailslot nick.simonian@mandiant.com
 - nursery/hash-data-using-sha512managed-in-dotnet jonathanlepore@google.com
 - nursery/compiled-with-exescript jonathanlepore@google.com
-
+

 ### Bug Fixes
 - extractor: update vivisect Arch extraction #1334 @mr-tz
--- a/capa/features/extractors/ghidra/init.py
+++ b/capa/features/extractors/ghidra/init.py
--- a/capa/features/extractors/ghidra/extractor.py
+++ b/capa/features/extractors/ghidra/extractor.py
@@ -0,0 +1,15 @@
+import logging
+import contextlib
+from typing import List, Tuple, Iterator
+
+import capa.features.extractors.ghidra.global_
+from capa.features.common import Feature
+from capa.features.address import Address, AbsoluteVirtualAddress
+from capa.features.extractors.base_extractor import FeatureExtractor
+
+class GhidraFeatureExtractor(FeatureExtractor):
+    def __init__(self):
+        super().__init__()
+        self.global_features: List[Tuple[Feature, Address]] = []
+        self.global_features.extend(capa.features.extractors.ghidra.global_.extract_os())
+        self.global_features.extend(capa.features.extractors.ghidra.global_.extract_arch())
--- a/capa/features/extractors/ghidra/file.py
+++ b/capa/features/extractors/ghidra/file.py
--- a/capa/features/extractors/ghidra/global_.py
+++ b/capa/features/extractors/ghidra/global_.py
@@ -0,0 +1,81 @@
+import logging
+import contextlib
+from io import BytesIO
+from typing import Tuple, Iterator 
+
+# imports for clarity
+#   note: currentProgram is a static variable accessible in
+#         the specific ghidra runtime environment
+import ghidra.program.database.mem
+import ghidra.program.flatapi as flatapi
+ghidraapi = flatapi.FlatProgramAPI(currentProgram) # Ghidrathon hacks :)
+
+import capa.features.extractors.elf
+from capa.features.common import OS, ARCH_I386, ARCH_AMD64, OS_WINDOWS, Arch, Feature
+from capa.features.address import NO_ADDRESS, Address
+
+logger = logging.getLogger(__name__)
+
+def extract_os() -> Iterator[Tuple[Feature, Address]]:
+    current_program = ghidraapi.getCurrentProgram()
+    format_name: str = current_program.getExecutableFormat()
+
+    if "PE" in format_name:
+        yield OS(OS_WINDOWS), NO_ADDRESS
+
+    elif "ELF" in format_name:
+        program_memory = current_program.getMemory()   # ghidra.program.database.mem.MemoryMapDB
+        fbytes_list = program_memory.getAllFileBytes() # java.util.List<FileBytes>
+        fbytes = fbytes_list[0]                        # ghidra.program.database.mem.FileBytes
+
+        # Java likes to return signed ints, so we must convert them
+        # back into unsigned bytes manually and write to BytesIO
+        #   note: May be deprecated if Jep has implements better support for Java Lists 
+        pb_arr = b''
+        for i in range(fbytes.getSize()):
+            pb_arr = pb_arr + (fbytes.getOriginalByte(i) & 0xff).to_bytes(1, 'little')
+        buf = BytesIO(pb_arr)
+
+        with contextlib.closing(buf) as f:
+            os = capa.features.extractors.elf.detect_elf_os(f)
+
+        yield OS(os), NO_ADDRESS
+
+    else:
+        # we likely end up here:
+        #  1. handling shellcode, or
+        #  2. handling a new file format (e.g. macho)
+        #
+        # for (1) we can't do much - its shellcode and all bets are off.
+        # we could maybe accept a further CLI argument to specify the OS,
+        # but i think this would be rarely used.
+        # rules that rely on OS conditions will fail to match on shellcode.
+        #
+        # for (2), this logic will need to be updated as the format is implemented.
+        logger.debug("unsupported file format: %s, will not guess OS", format_name)
+        return
+
+
+def extract_arch() -> Iterator[Tuple[Feature, Address]]:
+    current_program = ghidraapi.getCurrentProgram()
+    lang_id = current_program.getMetadata().get('Language ID')
+
+    if 'x86' in lang_id and '64' in lang_id:
+        yield Arch(ARCH_AMD64), NO_ADDRESS
+
+    elif 'x86' in lang_id and '32' in lang_id:
+        yield Arch(ARCH_I386), NO_ADDRESS
+
+    elif 'x86' not in lang_id:
+        logger.debug("unsupported architecture: non-32-bit nor non-64-bit intel")
+        return
+
+    else:
+        # we likely end up here:
+        #  1. handling a new architecture (e.g. aarch64)
+        #
+        # for (1), this logic will need to be updated as the format is implemented.
+        logger.debug("unsupported architecture: %s", lang_id)
+        return
+
+
--- a/capa/main.py
+++ b/capa/main.py
@@ -1317,6 +1317,33 @@ def ida_main():
    print(capa.render.default.render(meta, rules, capabilities))


+def ghidra_main():
+    import capa.rules
+    #import capa.render.default
+    #import capa.features.extractors.ghidra.extractor
+    import capa.features.extractors.ghidra.global_
+
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger().setLevel(logging.INFO)
+
+    logger.debug("-" * 80)
+    logger.debug(" Using default embedded rules.")
+    logger.debug(" ")
+    logger.debug(" You can see the current default rule set here:")
+    logger.debug("     https://github.com/mandiant/capa-rules")
+    logger.debug("-" * 80)
+
+    rules_path = os.path.join(get_default_root(), "rules")
+    logger.debug("rule path: %s", rules_path)
+    rules = get_rules([rules_path])
+    
+    # temp test for OS & ARCH extractions
+    globl_features: List[Tuple[Feature, Address]] = []
+    globl_features.extend(capa.features.extractors.ghidra.global_.extract_os())
+    globl_features.extend(capa.features.extractors.ghidra.global_.extract_arch())
+    print(globl_features)
+
+
 def is_runtime_ida():
    try:
        import idc
@@ -1326,8 +1353,20 @@ def is_runtime_ida():
        return True


+def is_runtime_ghidra():
+    try:
+        import ghidra.program.flatapi
+    except ImportError:
+        return False
+    else:
+        return True
+
+
 if __name__ == "__main__":
+    # pick the entry point for the host runtime: IDA is checked first,
+    # then Ghidra, and otherwise the regular command line interface runs
    if is_runtime_ida():
        ida_main()
+    elif is_runtime_ghidra():
+        ghidra_main()
    else:
        sys.exit(main())
+
--- a/2
+++ b/2
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -183,6 +183,18 @@ def get_binja_extractor(path):
    return extractor


+@lru_cache(maxsize=1)
+def get_ghidra_extractor(path):
+    import capa.features.extractors.ghidra.extractor
+
+    extractor = capa.features.extractors.ghidra.extractor.GhidraFeatureExtractor(path)
+
+    # overload the extractor so that the fixture exposes `extractor.path`
+    setattr(extractor, "path", path)
+
+    return extractor
+
+
 def extract_global_features(extractor):
    features = collections.defaultdict(set)
    for feature, va in extractor.extract_global_features():
--- a/tests/test_ghidra_features.py
+++ b/tests/test_ghidra_features.py
@@ -0,0 +1,55 @@
+import sys
+import logging
+import os.path
+import binascii
+import traceback
+
+import pytest
+
+try:
+    sys.path.append(os.path.dirname(__file__))
+    import fixtures
+    from fixtures import *
+finally:
+    sys.path.pop()
+
+
+logger = logging.getLogger("test_ghidra_features")
+
+
+# We need to skip the ghidra test if we cannot import ghidra modules, e.g., in GitHub CI.
+ghidra_present: bool = False
+try:
+    import ghidra.program.flatapi as flatapi
+    ghidraapi = flatapi.FlatProgramAPI(currentProgram) 
+
+    try:
+        current_program_test = ghidraapi.getCurrentProgram()
+    except RuntimeError as e:
+        logger.warning("Ghidra runtime not detected")
+    else:
+        ghidra_present = True
+except ImportError:
+    pass
+
+
+@pytest.mark.skipif(ghidra_present is False, reason="Skip ghidra tests if the ghidra Python API is not installed")
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_PRESENCE_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_ghidra_features(sample, scope, feature, expected):
+    fixtures.do_test_feature_presence(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
+
+
+@pytest.mark.skipif(ghidra_present is False, reason="Skip ghidra tests if the ghidra Python API is not installed")
+@fixtures.parametrize(
+    "sample,scope,feature,expected",
+    fixtures.FEATURE_COUNT_TESTS,
+    indirect=["sample", "scope"],
+)
+def test_ghidra_feature_counts(sample, scope, feature, expected):
+    fixtures.do_test_feature_count(fixtures.get_ghidra_extractor, sample, scope, feature, expected)
+
+