introduce flake8-todos linter

2025-12-12 15:49:46 -08:00 · 2023-07-09 23:35:52 +02:00
parent 4a49543d12
commit ae10a2ea34
19 changed files with 41 additions and 41 deletions
--- a/capa/features/common.py
+++ b/capa/features/common.py
@@ -21,6 +21,7 @@ if TYPE_CHECKING:
 import capa.perf
 import capa.features
 import capa.features.extractors.elf
+import capa.features.freeze.features
 from capa.features.address import Address

 logger = logging.getLogger(__name__)
@@ -127,9 +128,9 @@ class Feature(abc.ABC):  # noqa: B024
        return self.name == other.name and self.value == other.value

    def __lt__(self, other):
-        # TODO: this is a huge hack!
-        import capa.features.freeze.features
-
+        # implementing sorting by serializing to JSON is a huge hack.
+        # its slow, inelegant, and probably doesn't work intuitively;
+        # however, we only use it for deterministic output, so it's good enough for now.
        return (
            capa.features.freeze.features.feature_from_capa(self).json()
            < capa.features.freeze.features.feature_from_capa(other).json()
--- a/capa/features/extractors/binja/helpers.py
+++ b/capa/features/extractors/binja/helpers.py
@@ -41,7 +41,9 @@ def unmangle_c_name(name: str) -> str:
    # _lstrlenWStub@4

    # A small optimization to avoid running the regex too many times
-    # TODO: this still increases the unit test execution time from 170s to 200s, should be able to accelerate it
+    # this still increases the unit test execution time from 170s to 200s, should be able to accelerate it
+    #
+    # TODO(xusheng): performance optimizations to improve test execution time
    if name[0] in ["@", "_"]:
        match = re.match(r"^[@|_](.*?)(Stub)?(@\d+)?$", name)
        if match:
--- a/capa/features/extractors/binja/insn.py
+++ b/capa/features/extractors/binja/insn.py
@@ -324,7 +324,7 @@ def extract_insn_offset_features(

 def is_nzxor_stack_cookie(f: Function, bb: BinjaBasicBlock, llil: LowLevelILInstruction) -> bool:
    """check if nzxor exists within stack cookie delta"""
-    # TODO: we can do a much accurate analysi using LLIL SSA
+    # TODO(xusheng): use LLIL SSA to do more accurate analysis

    reg_names = []
    if llil.left.operation == LowLevelILOperation.LLIL_REG:
--- a/capa/features/extractors/elffile.py
+++ b/capa/features/extractors/elffile.py
@@ -36,8 +36,8 @@ def extract_file_import_names(elf, **kwargs):

        for _, symbol in enumerate(section.iter_symbols()):
            if symbol.name and symbol.entry.st_info.type == "STT_FUNC":
-                # TODO symbol address
-                # TODO symbol version info?
+                # TODO(williballenthin): extract symbol address
+                # TODO(williballenthin): is it useful to extract the symbol version info?
                yield Import(symbol.name), FileOffsetAddress(0x0)


@@ -68,7 +68,7 @@ def extract_file_format(**kwargs):


 def extract_file_arch(elf, **kwargs):
-    # TODO merge with capa.features.extractors.elf.detect_elf_arch()
+    # TODO(williballenthin): merge with capa.features.extractors.elf.detect_elf_arch()
    arch = elf.get_machine_arch()
    if arch == "x86":
        yield Arch("i386"), NO_ADDRESS
@@ -85,7 +85,7 @@ def extract_file_features(elf: ELFFile, buf: bytes) -> Iterator[Tuple[Feature, i


 FILE_HANDLERS = (
-    # TODO extract_file_export_names,
+    # TODO(williballenthin): implement extract_file_export_names
    extract_file_import_names,
    extract_file_section_names,
    extract_file_strings,
--- a/capa/features/extractors/ida/helpers.py
+++ b/capa/features/extractors/ida/helpers.py
@@ -28,7 +28,7 @@ def find_byte_sequence(start: int, end: int, seq: bytes) -> Iterator[int]:
    """
    seqstr = " ".join([f"{b:02x}" for b in seq])
    while True:
-        # TODO find_binary: Deprecated. Please use ida_bytes.bin_search() instead.
+        # TODO(mike-hunhoff): find_binary is deprecated. Please use ida_bytes.bin_search() instead.
        ea = idaapi.find_binary(start, end, seqstr, 0, idaapi.SEARCH_DOWN)
        if ea == idaapi.BADADDR:
            break
@@ -106,7 +106,8 @@ def get_file_imports() -> Dict[int, Tuple[str, str, int]]:

        # IDA uses section names for the library of ELF imports, like ".dynsym".
        # These are not useful to us, we may need to expand this list over time
-        # TODO: exhaust this list, see #1419
+        # TODO(williballenthin): find all section names used by IDA
+        # https://github.com/mandiant/capa/issues/1419
        if library == ".dynsym":
            library = ""

--- a/capa/features/extractors/ida/insn.py
+++ b/capa/features/extractors/ida/insn.py
@@ -405,7 +405,8 @@ def extract_insn_peb_access_characteristic_features(
    disasm = idc.GetDisasm(insn.ea)

    if " fs:30h" in disasm or " gs:60h" in disasm:
-        # TODO: replace above with proper IDA
+        # TODO(mike-hunhoff): use proper IDA API for fetching segment access
+        # scanning the disassembly text is a hack.
        yield Characteristic("peb access"), ih.address


@@ -426,11 +427,13 @@ def extract_insn_segment_access_features(
    disasm = idc.GetDisasm(insn.ea)

    if " fs:" in disasm:
-        # TODO: replace above with proper IDA
+        # TODO(mike-hunhoff): use proper IDA API for fetching segment access
+        # scanning the disassembly text is a hack.
        yield Characteristic("fs access"), ih.address

    if " gs:" in disasm:
-        # TODO: replace above with proper IDA
+        # TODO(mike-hunhoff): use proper IDA API for fetching segment access
+        # scanning the disassembly text is a hack.
        yield Characteristic("gs access"), ih.address


--- a/capa/features/extractors/viv/basicblock.py
+++ b/capa/features/extractors/viv/basicblock.py
@@ -92,7 +92,6 @@ def is_mov_imm_to_stack(instr: envi.archs.i386.disasm.i386Opcode) -> bool:
    if not src.isImmed():
        return False

-    # TODO what about 64-bit operands?
    if not isinstance(dst, envi.archs.i386.disasm.i386SibOper) and not isinstance(
        dst, envi.archs.i386.disasm.i386RegMemOper
    ):
--- a/capa/features/extractors/viv/insn.py
+++ b/capa/features/extractors/viv/insn.py
@@ -351,7 +351,6 @@ def is_security_cookie(f, bb, insn) -> bool:
    if oper.isReg() and oper.reg not in [
        envi.archs.i386.regs.REG_ESP,
        envi.archs.i386.regs.REG_EBP,
-        # TODO: do x64 support for real.
        envi.archs.amd64.regs.REG_RBP,
        envi.archs.amd64.regs.REG_RSP,
    ]:
@@ -422,7 +421,6 @@ def extract_insn_peb_access_characteristic_features(f, bb, ih: InsnHandle) -> It
    """
    parse peb access from the given function. fs:[0x30] on x86, gs:[0x60] on x64
    """
-    # TODO handle where fs/gs are loaded into a register or onto the stack and used later
    insn: envi.Opcode = ih.inner

    if insn.mnem not in ["push", "mov"]:
@@ -646,7 +644,6 @@ def extract_op_offset_features(
        if oper.reg == envi.archs.i386.regs.REG_EBP:
            return

-        # TODO: do x64 support for real.
        if oper.reg == envi.archs.amd64.regs.REG_RBP:
            return

--- a/capa/ida/plugin/cache.py
+++ b/capa/ida/plugin/cache.py
@@ -37,13 +37,13 @@ class CapaRuleGenFeatureCacheNode:
        self.children: Set[CapaRuleGenFeatureCacheNode] = set()

    def __hash__(self):
-        # TODO: unique enough?
+        # TODO(mike-hunhoff): confirm this is unique enough
        return hash((self.address,))

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
-        # TODO: unique enough?
+        # TODO(mike-hunhoff): confirm this is unique enough
        return self.address == other.address


--- a/capa/ida/plugin/form.py
+++ b/capa/ida/plugin/form.py
@@ -791,7 +791,7 @@ class CapaExplorerForm(idaapi.PluginForm):
                try:
                    # support binary files specifically for x86/AMD64 shellcode
                    # warn user binary file is loaded but still allow capa to process it
-                    # TODO: check specific architecture of binary files based on how user configured IDA processors
+                    # TODO(mike-hunhoff): check specific architecture of binary files based on how user configured IDA processors
                    if idaapi.get_file_type_name() == "Binary file":
                        logger.warning("-" * 80)
                        logger.warning(" Input file appears to be a binary file.")
--- a/capa/ida/plugin/model.py
+++ b/capa/ida/plugin/model.py
@@ -372,7 +372,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
                    display += f" ({statement.description})"
                return CapaExplorerDefaultItem(parent, display)
        elif isinstance(statement, rd.CompoundStatement) and statement.type == rd.CompoundStatementType.NOT:
-            # TODO: do we display 'not'
+            # TODO(mike-hunhoff): verify that we can display NOT statements
            pass
        elif isinstance(statement, rd.SomeStatement):
            display = f"{statement.count} or more"
@@ -421,7 +421,7 @@ class CapaExplorerDataModel(QtCore.QAbstractItemModel):
        @param doc: result doc
        """
        if not match.success:
-            # TODO: display failed branches at some point? Help with debugging rules?
+            # TODO(mike-hunhoff): display failed branches at some point? Help with debugging rules?
            return

        # optional statement with no successful children is empty
--- a/capa/ida/plugin/view.py
+++ b/capa/ida/plugin/view.py
@@ -502,7 +502,7 @@ class CapaExplorerRulegenEditor(QtWidgets.QTreeWidget):
            feature, description, comment = (o.strip() for o in tuple(o.text(i) for i in range(3)))
            rule_text += parse_node_for_feature(feature, description, comment, calc_item_depth(o))

-        # FIXME we avoid circular update by disabling signals when updating
+        # TODO(mike-hunhoff): we avoid circular update by disabling signals when updating
        # the preview. Preferably we would refactor the code to avoid this
        # in the first place
        self.preview.blockSignals(True)
--- a/capa/main.py
+++ b/capa/main.py
@@ -326,10 +326,9 @@ def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_pro
    return matches, meta


-# TODO move all to helpers?
-def has_rule_with_namespace(rules, capabilities, rule_cat):
+def has_rule_with_namespace(rules: RuleSet, capabilities: MatchResults, namespace: str) -> bool:
    for rule_name in capabilities.keys():
-        if rules.rules[rule_name].meta.get("namespace", "").startswith(rule_cat):
+        if rules.rules[rule_name].meta.get("namespace", "").startswith(namespace):
            return True
    return False

@@ -484,7 +483,6 @@ def get_workspace(path, format_, sigpaths):
    import viv_utils.flirt

    logger.debug("generating vivisect workspace for: %s", path)
-    # TODO should not be auto at this point, anymore
    if format_ == FORMAT_AUTO:
        if not is_supported_format(path):
            raise UnsupportedFormatError()
@@ -509,7 +507,6 @@ def get_workspace(path, format_, sigpaths):
    return vw


-# TODO get_extractors -> List[FeatureExtractor]?
 def get_extractor(
    path: str,
    format_: str,
@@ -1165,8 +1162,8 @@ def main(argv=None):
            rules = rules.filter_rules_by_meta(args.tag)
            logger.debug("selected %d rules", len(rules))
            for i, r in enumerate(rules.rules, 1):
-                # TODO don't display subscope rules?
                logger.debug(" %d. %s", i, r)
+
    except (IOError, capa.rules.InvalidRule, capa.rules.InvalidRuleSet) as e:
        logger.error("%s", str(e))
        logger.error(
--- a/capa/optimizer.py
+++ b/capa/optimizer.py
@@ -22,7 +22,7 @@ def get_node_cost(node):
        # substring and regex features require a full scan of each string
        # which we anticipate is more expensive then a hash lookup feature (e.g. mnemonic or count).
        #
-        # TODO: compute the average cost of these feature relative to hash feature
+        # fun research: compute the average cost of these feature relative to hash feature
        # and adjust the factor accordingly.
        return 2

--- a/capa/rules/init.py
+++ b/capa/rules/init.py
@@ -510,7 +510,9 @@ def build_statements(d, scope: str):
                # arg is string (which doesn't support inline descriptions), like:
                #
                #     count(string(error))
-                # TODO: what about embedded newlines?
+                #
+                # known problem that embedded newlines may not work here?
+                # this may become a problem (or not), so address it when encountered.
                feature = Feature(arg)
        else:
            feature = Feature()
@@ -1190,7 +1192,6 @@ class RuleSet:
                    # so thats not helpful to decide how to downselect.
                    #
                    # and, a global rule will never be the sole selector in a rule.
-                    # TODO: probably want a lint for this.
                    pass
                else:
                    # easy feature: hash lookup
--- a/scripts/capa2yara.py
+++ b/scripts/capa2yara.py
@@ -54,7 +54,7 @@ var_names = ["".join(letters) for letters in itertools.product(string.ascii_lowe

 # this have to be the internal names used by capa.py which are sometimes different to the ones written out in the rules, e.g. "2 or more" is "Some", count is Range
 unsupported = ["characteristic", "mnemonic", "offset", "subscope", "Range"]
-# TODO shorten this list, possible stuff:
+# further idea: shorten this list, possible stuff:
 # - 2 or more strings: e.g.
 # -- https://github.com/mandiant/capa-rules/blob/master/collection/file-managers/gather-direct-ftp-information.yml
 # -- https://github.com/mandiant/capa-rules/blob/master/collection/browser/gather-firefox-profile-information.yml
@@ -172,14 +172,16 @@ def convert_rule(rule, rulename, cround, depth):
            yara_strings += "\t$" + var_name + ' = "' + string + '" ascii wide' + convert_description(kid) + "\n"
            yara_condition += "\t$" + var_name + " "
        elif s_type == "api" or s_type == "import":
-            # TODO: is it possible in YARA to make a difference between api & import?
+            # research needed to decide if its possible in YARA to make a difference between api & import?

            # https://github.com/mandiant/capa-rules/blob/master/doc/format.md#api
            api = kid.value
            logger.info("doing api: %r", api)

            #    e.g. kernel32.CreateNamedPipe => look for kernel32.dll and CreateNamedPipe
-            # TODO: improve .NET API call handling
+            #
+            # note: the handling of .NET API calls could be improved here.
+            # once we have a motivation and some examples, lets do that.
            if "::" in api:
                mod, api = api.split("::")

@@ -204,7 +206,7 @@ def convert_rule(rule, rulename, cround, depth):
                var_name = "api_" + var_names.pop(0)

                # limit regex with word boundary \b but also search for appended A and W
-                # TODO: better use something like /(\\x00|\\x01|\\x02|\\x03|\\x04)' + api + '(A|W)?\\x00/  ???
+                # alternatively: use something like /(\\x00|\\x01|\\x02|\\x03|\\x04)' + api + '(A|W)?\\x00/  ???
                yara_strings += "\t$" + var_name + " = /\\b" + api + "(A|W)?\\b/ ascii wide\n"
                yara_condition += "\t$" + var_name + " "

@@ -679,8 +681,6 @@ def convert_rules(rules, namespaces, cround, make_priv):

                yara += "  condition:" + condition_header + yara_condition + "\n}"

-                # TODO: now the rule is finished and could be automatically checked with the capa-testfile(s) named in meta
-                # (doing it for all of them using yara-ci upload at the moment)
                output_yar(yara)
                converted_rules.append(rule_name)
                count_incomplete += incomplete
--- a/scripts/show-features.py
+++ b/scripts/show-features.py
@@ -142,7 +142,6 @@ def main(argv=None):

    if args.function:
        if args.format == "freeze":
-            # TODO fix
            function_handles = tuple(filter(lambda fh: fh.address == args.function, function_handles))
        else:
            function_handles = tuple(filter(lambda fh: format_address(fh.address) == args.function, function_handles))
--- a/setup.py
+++ b/setup.py
@@ -81,6 +81,7 @@ setuptools.setup(
            "flake8-logging-format==0.9.0",
            "flake8-no-implicit-concat==0.3.4",
            "flake8-print==5.0.0",
+            "flake8-todos==0.3.0",
            "ruff==0.0.275",
            "black==23.3.0",
            "isort==5.11.4",
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -653,7 +653,6 @@ FEATURE_PRESENCE_TESTS = sorted(
        # insn/api: call via jmp
        ("mimikatz", "function=0x40B3C6", capa.features.insn.API("LocalFree"), True),
        ("c91887...", "function=0x40156F", capa.features.insn.API("CloseClipboard"), True),
-        # TODO ignore thunk functions that call via jmp?
        # insn/api: resolve indirect calls
        ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.CreatePipe"), True),
        ("c91887...", "function=0x401A77", capa.features.insn.API("kernel32.SetHandleInformation"), True),