Merge pull request #1222 from mandiant/fix/issue-1221

elf: better detect linux ELF files
2025-12-12 15:49:46 -08:00 · 2022-12-12 13:28:59 +01:00
parent a46d7b3262 0f902124d1
commit d6674c7548
5 changed files with 703 additions and 235 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -50,6 +50,7 @@
 - update pydantic model to guarantee type coercion #1176 @mike-hunhoff
 - do not overwrite version in version.py during PyInstaller build #1169 @mr-tz
 - render: fix vverbose rendering of offsets #1215 @williballenthin
+- elf: better detect OS via GLIBC ABI version needed and dependencies #1221 @williballenthin

 ### capa explorer IDA Pro plugin
 - fix: display instruction items #1154 @mr-tz
--- a/capa/features/extractors/elf.py
+++ b/capa/features/extractors/elf.py
@@ -7,8 +7,11 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import struct
 import logging
+import itertools
+import collections
 from enum import Enum
-from typing import BinaryIO
+from typing import Set, Dict, List, Tuple, BinaryIO, Iterator, Optional
+from dataclasses import dataclass

 logger = logging.getLogger(__name__)

@@ -21,6 +24,12 @@ def align(v, alignment):
        return v + (alignment - remainder)


+def read_cstr(buf, offset):
+    s = buf[offset:]
+    s, _, _ = s.partition(b"\x00")
+    return s.decode("utf-8")
+
+
 class CorruptElfFile(ValueError):
    pass

@@ -60,52 +69,94 @@ GNU_ABI_TAG = {
 }


-def detect_elf_os(f) -> str:
-    """
-    f: type Union[BinaryIO, IDAIO]
-    """
-    f.seek(0x0)
-    file_header = f.read(0x40)
+@dataclass
+class Phdr:
+    # one entry of the program header table, together with the bytes it refers to.
+    type: int  # p_type
+    offset: int  # p_offset: file offset that `buf` was read from
+    vaddr: int  # p_vaddr
+    paddr: int  # p_paddr
+    filesz: int  # p_filesz: number of bytes in `buf`
+    buf: bytes  # content read from the file at `offset`

-    # we'll set this to the detected OS
-    # prefer the first heuristics,
-    # but rather than short circuiting,
-    # we'll still parse out the remainder, for debugging.
-    ret = None

-    if not file_header.startswith(b"\x7fELF"):
+@dataclass
+class Shdr:
+    # one entry of the section header table, together with the bytes it refers to.
+    name: int  # sh_name: kept as the raw integer from the header, not resolved to a string
+    type: int  # sh_type
+    flags: int  # sh_flags
+    addr: int  # sh_addr
+    offset: int  # sh_offset: file offset that `buf` was read from
+    size: int  # sh_size: number of bytes in `buf`
+    link: int  # sh_link: used as an index into the section header table
+    buf: bytes  # content read from the file at `offset`
+
+
+class ELF:
+    def __init__(self, f):
+        """
+        f: seekable, readable binary file-like object containing the ELF data.
+
+        the file header and both header tables are read immediately via `_parse`.
+        """
+        self.f = f
+
+        # all of the following are placeholders that `_parse` fills in.
+        self.bitness: int = None
+        self.endian: str = None
+        self.e_phentsize: int = None
+        self.e_phnum: int = None
+        self.e_shentsize: int = None
+        self.e_shnum: int = None
+        # raw bytes of the program header table and the section header table.
+        self.phbuf = None
+        self.shbuf = None
+
+        self._parse()
+
+    def _parse(self):
+
+        self.f.seek(0x0)
+        self.file_header = self.f.read(0x40)
+
+        if not self.file_header.startswith(b"\x7fELF"):
            raise CorruptElfFile("missing magic header")

-    ei_class, ei_data = struct.unpack_from("BB", file_header, 4)
+        ei_class, ei_data = struct.unpack_from("BB", self.file_header, 4)
        logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data)
        if ei_class == 1:
-        bitness = 32
+            self.bitness = 32
        elif ei_class == 2:
-        bitness = 64
+            self.bitness = 64
        else:
            raise CorruptElfFile("invalid ei_class: 0x%02x" % ei_class)

        if ei_data == 1:
-        endian = "<"
+            self.endian = "<"
        elif ei_data == 2:
-        endian = ">"
+            self.endian = ">"
        else:
            raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data)

-    if bitness == 32:
-        (e_phoff, e_shoff) = struct.unpack_from(endian + "II", file_header, 0x1C)
-        e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x2A)
-        e_shentsize, e_shnum = struct.unpack_from(endian + "HH", file_header, 0x2E)
-    elif bitness == 64:
-        (e_phoff, e_shoff) = struct.unpack_from(endian + "QQ", file_header, 0x20)
-        e_phentsize, e_phnum = struct.unpack_from(endian + "HH", file_header, 0x36)
-        e_shentsize, e_shnum = struct.unpack_from(endian + "HH", file_header, 0x3A)
+        if self.bitness == 32:
+            e_phoff, e_shoff = struct.unpack_from(self.endian + "II", self.file_header, 0x1C)
+            self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2A)
+            self.e_shentsize, self.e_shnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2E)
+        elif self.bitness == 64:
+            e_phoff, e_shoff = struct.unpack_from(self.endian + "QQ", self.file_header, 0x20)
+            self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x36)
+            self.e_shentsize, self.e_shnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x3A)
        else:
            raise NotImplementedError()

-    logger.debug("e_phoff: 0x%02x e_phentsize: 0x%02x e_phnum: %d", e_phoff, e_phentsize, e_phnum)
+        logger.debug("e_phoff: 0x%02x e_phentsize: 0x%02x e_phnum: %d", e_phoff, self.e_phentsize, self.e_phnum)
+
+        self.f.seek(e_phoff)
+        program_header_size = self.e_phnum * self.e_phentsize
+        self.phbuf = self.f.read(program_header_size)
+        if len(self.phbuf) != program_header_size:
+            logger.warning("failed to read program headers")
+            self.e_phnum = 0
+
+        self.f.seek(e_shoff)
+        section_header_size = self.e_shnum * self.e_shentsize
+        self.shbuf = self.f.read(section_header_size)
+        if len(self.shbuf) != section_header_size:
+            logger.warning("failed to read section headers")
+            self.e_shnum = 0

-    (ei_osabi,) = struct.unpack_from(endian + "B", file_header, 7)
    OSABI = {
        # via pyelftools: https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/elftools/elf/enums.py#L35-L58
        # some candidates are commented out because they are not useful values,
@@ -133,218 +184,596 @@ def detect_elf_os(f) -> str:
        # 97: "ARM",          # not an OS
        # 255: "STANDALONE",  # not an OS
    }
-    logger.debug("ei_osabi: 0x%02x (%s)", ei_osabi, OSABI.get(ei_osabi, "unknown"))

-    # os_osabi == 0 is commonly set even when the OS is not SYSV.
-    # other values are unused or unknown.
-    if ei_osabi in OSABI and ei_osabi != 0x0:
-        # subsequent strategies may overwrite this value
-        ret = OSABI[ei_osabi]
+    @property
+    def ei_osabi(self) -> Optional[OS]:
+        (ei_osabi,) = struct.unpack_from(self.endian + "B", self.file_header, 7)
+        return ELF.OSABI.get(ei_osabi)

-    f.seek(e_phoff)
-    program_header_size = e_phnum * e_phentsize
-    program_headers = f.read(program_header_size)
-    if len(program_headers) != program_header_size:
-        logger.warning("failed to read program headers")
-        e_phnum = 0
+    MACHINE = {
+        # via https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
+        1: "M32",
+        2: "SPARC",
+        3: "i386",
+        4: "68K",
+        5: "88K",
+        6: "486",
+        7: "860",
+        8: "MIPS",
+        9: "S370",
+        10: "MIPS_RS3_LE",
+        11: "RS6000",
+        15: "PA_RISC",
+        16: "nCUBE",
+        17: "VPP500",
+        18: "SPARC32PLUS",
+        19: "960",
+        20: "PPC",
+        21: "PPC64",
+        22: "S390",
+        23: "SPU",
+        36: "V800",
+        37: "FR20",
+        38: "RH32",
+        39: "RCE",
+        40: "ARM",
+        41: "ALPHA",
+        42: "SH",
+        43: "SPARCV9",
+        44: "TRICORE",
+        45: "ARC",
+        46: "H8_300",
+        47: "H8_300H",
+        48: "H8S",
+        49: "H8_500",
+        50: "IA_64",
+        51: "MIPS_X",
+        52: "COLDFIRE",
+        53: "68HC12",
+        54: "MMA",
+        55: "PCP",
+        56: "NCPU",
+        57: "NDR1",
+        58: "STARCORE",
+        59: "ME16",
+        60: "ST100",
+        61: "TINYJ",
+        62: "amd64",
+        63: "PDSP",
+        64: "PDP10",
+        65: "PDP11",
+        66: "FX66",
+        67: "ST9PLUS",
+        68: "ST7",
+        69: "68HC16",
+        70: "68HC11",
+        71: "68HC08",
+        72: "68HC05",
+        73: "SVX",
+        74: "ST19",
+        75: "VAX",
+        76: "CRIS",
+        77: "JAVELIN",
+        78: "FIREPATH",
+        79: "ZSP",
+        80: "MMIX",
+        81: "HUANY",
+        82: "PRISM",
+        83: "AVR",
+        84: "FR30",
+        85: "D10V",
+        86: "D30V",
+        87: "V850",
+        88: "M32R",
+        89: "MN10300",
+        90: "MN10200",
+        91: "PJ",
+        92: "OPENRISC",
+        93: "ARC_A5",
+        94: "XTENSA",
+        95: "VIDEOCORE",
+        96: "TMM_GPP",
+        97: "NS32K",
+        98: "TPC",
+        99: "SNP1K",
+        100: "ST200",
+    }

-    # search for PT_NOTE sections that specify an OS
-    # for example, on Linux there is a GNU section with minimum kernel version
-    for i in range(e_phnum):
-        offset = i * e_phentsize
-        phent = program_headers[offset : offset + e_phentsize]
+    @property
+    def e_machine(self) -> Optional[str]:
+        (e_machine,) = struct.unpack_from(self.endian + "H", self.file_header, 0x12)
+        return ELF.MACHINE.get(e_machine)

-        PT_NOTE = 0x4
+    def parse_program_header(self, i) -> Phdr:
+        phent_offset = i * self.e_phentsize
+        phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize]

-        (p_type,) = struct.unpack_from(endian + "I", phent, 0x0)
+        (p_type,) = struct.unpack_from(self.endian + "I", phent, 0x0)
        logger.debug("ph:p_type: 0x%04x", p_type)
-        if p_type != PT_NOTE:
-            continue

-        if bitness == 32:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4)
-        elif bitness == 64:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8)
+        if self.bitness == 32:
+            p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "IIII", phent, 0x4)
+        elif self.bitness == 64:
+            p_offset, p_vaddr, p_paddr, p_filesz = struct.unpack_from(self.endian + "QQQQ", phent, 0x8)
        else:
            raise NotImplementedError()

        logger.debug("ph:p_offset: 0x%02x p_filesz: 0x%04x", p_offset, p_filesz)

-        f.seek(p_offset)
-        note = f.read(p_filesz)
-        if len(note) != p_filesz:
-            logger.warning("failed to read note content")
+        self.f.seek(p_offset)
+        buf = self.f.read(p_filesz)
+        if len(buf) != p_filesz:
+            raise ValueError("failed to read program header content")
+
+        return Phdr(p_type, p_offset, p_vaddr, p_paddr, p_filesz, buf)
+
+    @property
+    def program_headers(self):
+        for i in range(self.e_phnum):
+            try:
+                yield self.parse_program_header(i)
+            except ValueError:
                continue

-        namesz, descsz, type_ = struct.unpack_from(endian + "III", note, 0x0)
-        name_offset = 0xC
-        desc_offset = name_offset + align(namesz, 0x4)
+    def parse_section_header(self, i) -> Shdr:
+        shent_offset = i * self.e_shentsize
+        shent = self.shbuf[shent_offset : shent_offset + self.e_shentsize]

-        logger.debug("ph:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, descsz, type_)
-
-        name = note[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii")
-        logger.debug("name: %s", name)
-
-        if type_ != 1:
-            continue
-
-        if name == "GNU":
-            if descsz < 16:
-                continue
-
-            desc = note[desc_offset : desc_offset + descsz]
-            abi_tag, kmajor, kminor, kpatch = struct.unpack_from(endian + "IIII", desc, 0x0)
-            logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag)
-
-            if abi_tag in GNU_ABI_TAG:
-                # update only if not set
-                # so we can get the debugging output of subsequent strategies
-                ret = GNU_ABI_TAG[abi_tag] if not ret else ret
-                logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", ret, kmajor, kminor, kpatch)
-        elif name == "OpenBSD":
-            logger.debug("note owner: %s", "OPENBSD")
-            ret = OS.OPENBSD if not ret else ret
-        elif name == "NetBSD":
-            logger.debug("note owner: %s", "NETBSD")
-            ret = OS.NETBSD if not ret else ret
-        elif name == "FreeBSD":
-            logger.debug("note owner: %s", "FREEBSD")
-            ret = OS.FREEBSD if not ret else ret
-
-    # search for recognizable dynamic linkers (interpreters)
-    # for example, on linux, we see file paths like: /lib64/ld-linux-x86-64.so.2
-    for i in range(e_phnum):
-        offset = i * e_phentsize
-        phent = program_headers[offset : offset + e_phentsize]
-
-        PT_INTERP = 0x3
-
-        (p_type,) = struct.unpack_from(endian + "I", phent, 0x0)
-        if p_type != PT_INTERP:
-            continue
-
-        if bitness == 32:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "IIII", phent, 0x4)
-        elif bitness == 64:
-            p_offset, _, _, p_filesz = struct.unpack_from(endian + "QQQQ", phent, 0x8)
+        if self.bitness == 32:
+            sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link = struct.unpack_from(
+                self.endian + "IIIIIII", shent, 0x0
+            )
+        elif self.bitness == 64:
+            sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link = struct.unpack_from(
+                self.endian + "IIQQQQI", shent, 0x0
+            )
        else:
            raise NotImplementedError()

-        f.seek(p_offset)
-        interp = f.read(p_filesz)
-        if len(interp) != p_filesz:
-            logger.warning("failed to read interp content")
-            continue
-
-        linker = interp.partition(b"\x00")[0].decode("ascii")
-        logger.debug("linker: %s", linker)
-        if "ld-linux" in linker:
-            # update only if not set
-            # so we can get the debugging output of subsequent strategies
-            ret = OS.LINUX if ret is None else ret
-
-    f.seek(e_shoff)
-    section_header_size = e_shnum * e_shentsize
-    section_headers = f.read(section_header_size)
-    if len(section_headers) != section_header_size:
-        logger.warning("failed to read section headers")
-        e_shnum = 0
-
-    # search for notes stored in sections that aren't visible in program headers.
-    # e.g. .note.Linux in Linux kernel modules.
-    for i in range(e_shnum):
-        offset = i * e_shentsize
-        shent = section_headers[offset : offset + e_shentsize]
-
-        if bitness == 32:
-            sh_name, sh_type, _, sh_addr, sh_offset, sh_size = struct.unpack_from(endian + "IIIIII", shent, 0x0)
-        elif bitness == 64:
-            sh_name, sh_type, _, sh_addr, sh_offset, sh_size = struct.unpack_from(endian + "IIQQQQ", shent, 0x0)
-        else:
-            raise NotImplementedError()
-
-        SHT_NOTE = 0x7
-        if sh_type != SHT_NOTE:
-            continue
-
        logger.debug("sh:sh_offset: 0x%02x sh_size: 0x%04x", sh_offset, sh_size)

-        f.seek(sh_offset)
-        note = f.read(sh_size)
-        if len(note) != sh_size:
-            logger.warning("failed to read note content")
+        self.f.seek(sh_offset)
+        buf = self.f.read(sh_size)
+        if len(buf) != sh_size:
+            raise ValueError("failed to read section header content")
+
+        return Shdr(sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, buf)
+
+    @property
+    def section_headers(self):
+        for i in range(self.e_shnum):
+            try:
+                yield self.parse_section_header(i)
+            except ValueError:
                continue

-        namesz, descsz, type_ = struct.unpack_from(endian + "III", note, 0x0)
+    @property
+    def linker(self):
+        PT_INTERP = 0x3
+        for phdr in self.program_headers:
+            if phdr.type != PT_INTERP:
+                continue
+
+            return read_cstr(phdr.buf, 0)
+
+    @property
+    def versions_needed(self) -> Dict[str, Set[str]]:
+        # symbol version requirements are stored in the .gnu.version_r section,
+        # which has type SHT_GNU_verneed (0x6ffffffe).
+        #
+        # this contains a linked list of ElfXX_Verneed structs,
+        # each referencing a linked list of ElfXX_Vernaux structs.
+        # strings are stored in the section referenced by the sh_link field of the section header.
+        # each Verneed struct contains a reference to the name of the library,
+        # each Vernaux struct contains a reference to the name of a symbol.
+        SHT_GNU_VERNEED = 0x6FFFFFFE
+        for shdr in self.section_headers:
+            if shdr.type != SHT_GNU_VERNEED:
+                continue
+
+            # the linked section contains strings referenced by the verneed structures.
+            linked_shdr = self.parse_section_header(shdr.link)
+
+            versions_needed = collections.defaultdict(set)
+
+            # read verneed structures from the start of the section
+            # until the vn_next link is 0x0.
+            # each entry describes a shared object that is required by this binary.
+            vn_offset = 0x0
+            while True:
+                # ElfXX_Verneed layout is the same on 32 and 64 bit
+                vn_version, vn_cnt, vn_file, vn_aux, vn_next = struct.unpack_from(
+                    self.endian + "HHIII", shdr.buf, vn_offset
+                )
+                if vn_version != 1:
+                    # unexpected format, don't try to keep parsing
+                    break
+
+                # shared object names, like: "libdl.so.2"
+                so_name = read_cstr(linked_shdr.buf, vn_file)
+
+                # read vernaux structures linked from the verneed structure.
+                # there should be vn_cnt of these.
+                # each entry describes an ABI name required by the shared object.
+                vna_offset = vn_offset + vn_aux
+                for i in range(vn_cnt):
+                    # ElfXX_Vernaux layout is the same on 32 and 64 bit
+                    _, _, _, vna_name, vna_next = struct.unpack_from(self.endian + "IHHII", shdr.buf, vna_offset)
+
+                    # ABI names, like: "GLIBC_2.2.5"
+                    abi = read_cstr(linked_shdr.buf, vna_name)
+                    versions_needed[so_name].add(abi)
+
+                    vna_offset += vna_next
+
+                vn_offset += vn_next
+                if vn_next == 0:
+                    break
+
+            return dict(versions_needed)
+
+        return {}
+
+    @property
+    def dynamic_entries(self) -> Iterator[Tuple[int, int]]:
+        """
+        read the entries from the dynamic section,
+        yielding the tag and value for each entry.
+        """
+        DT_NULL = 0x0
+        PT_DYNAMIC = 0x2
+        for phdr in self.program_headers:
+            if phdr.type != PT_DYNAMIC:
+                continue
+
+            offset = 0x0
+            while True:
+                if self.bitness == 32:
+                    d_tag, d_val = struct.unpack_from(self.endian + "II", phdr.buf, offset)
+                    offset += 8
+                elif self.bitness == 64:
+                    d_tag, d_val = struct.unpack_from(self.endian + "QQ", phdr.buf, offset)
+                    offset += 16
+                else:
+                    raise NotImplementedError()
+
+                if d_tag == DT_NULL:
+                    break
+
+                yield d_tag, d_val
+
+    @property
+    def strtab(self) -> Optional[bytes]:
+        """
+        fetch the bytes of the string table
+        referenced by the dynamic section.
+        """
+        DT_STRTAB = 0x5
+        DT_STRSZ = 0xA
+
+        strtab_addr = None
+        strtab_size = None
+
+        for d_tag, d_val in self.dynamic_entries:
+            if d_tag == DT_STRTAB:
+                strtab_addr = d_val
+
+        for d_tag, d_val in self.dynamic_entries:
+            if d_tag == DT_STRSZ:
+                strtab_size = d_val
+
+        if strtab_addr is None:
+            return None
+
+        if strtab_size is None:
+            return None
+
+        strtab_offset = None
+        for shdr in self.section_headers:
+            if shdr.addr <= strtab_addr < shdr.addr + shdr.size:
+                strtab_offset = shdr.offset + (strtab_addr - shdr.addr)
+
+        if strtab_offset is None:
+            return None
+
+        self.f.seek(strtab_offset)
+        strtab_buf = self.f.read(strtab_size)
+
+        if len(strtab_buf) != strtab_size:
+            return None
+
+        return strtab_buf
+
+    @property
+    def needed(self) -> Iterator[str]:
+        """
+        read the names of DT_NEEDED entries from the dynamic section,
+        which correspond to dependencies on other shared objects,
+        like: `libpthread.so.0`
+        """
+        DT_NEEDED = 0x1
+        strtab = self.strtab
+        if not strtab:
+            return
+
+        for d_tag, d_val in self.dynamic_entries:
+            if d_tag != DT_NEEDED:
+                continue
+
+            yield read_cstr(strtab, d_val)
+
+
+@dataclass
+class ABITag:
+    # decoded descriptor of a "GNU" note: an OS plus three version numbers,
+    # which the parsers log as the earliest compatible kernel.
+    os: OS  # looked up in GNU_ABI_TAG
+    kmajor: int
+    kminor: int
+    kpatch: int
+
+
+class PHNote:
+    def __init__(self, endian, buf):
+        """
+        endian: struct byte-order prefix, "<" or ">".
+        buf: bytes of the note; parsed immediately via `_parse`.
+        """
+        self.endian = endian
+        self.buf = buf
+
+        # placeholders until `_parse` runs.
+        self.type_: int = None
+        self.descsz: int = None
+        self.name: str = None
+
+        self._parse()
+
+    def _parse(self):
+        namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0)
        name_offset = 0xC
-        desc_offset = name_offset + align(namesz, 0x4)
+        self.desc_offset = name_offset + align(namesz, 0x4)

-        logger.debug("sh:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, descsz, type_)
+        logger.debug("ph:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, self.descsz, self.type_)

-        name = note[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii")
+        name = self.buf[name_offset : name_offset + namesz].partition(b"\x00")[0].decode("ascii")
        logger.debug("name: %s", name)

-        if name == "Linux":
-            logger.debug("note owner: %s", "LINUX")
-            ret = OS.LINUX if not ret else ret
-        elif name == "OpenBSD":
-            logger.debug("note owner: %s", "OPENBSD")
-            ret = OS.OPENBSD if not ret else ret
-        elif name == "NetBSD":
-            logger.debug("note owner: %s", "NETBSD")
-            ret = OS.NETBSD if not ret else ret
-        elif name == "FreeBSD":
-            logger.debug("note owner: %s", "FREEBSD")
-            ret = OS.FREEBSD if not ret else ret
-        elif name == "GNU":
-            if descsz < 16:
-                continue
+    @property
+    def abi_tag(self) -> Optional[ABITag]:
+        if self.type_ != 1:
+            # > The type field shall be 1.
+            # Linux Standard Base Specification 1.2
+            # ref: https://refspecs.linuxfoundation.org/LSB_1.2.0/gLSB/noteabitag.html
+            return None

-            desc = note[desc_offset : desc_offset + descsz]
-            abi_tag, kmajor, kminor, kpatch = struct.unpack_from(endian + "IIII", desc, 0x0)
+        if self.name != "GNU":
+            return None
+
+        if self.descsz < 16:
+            return None
+
+        desc = self.buf[self.desc_offset : self.desc_offset + self.descsz]
+        abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0)
        logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag)

-            if abi_tag in GNU_ABI_TAG:
-                # update only if not set
-                # so we can get the debugging output of subsequent strategies
-                ret = GNU_ABI_TAG[abi_tag] if not ret else ret
-                logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", ret, kmajor, kminor, kpatch)
+        os = GNU_ABI_TAG.get(abi_tag)
+        if not os:
+            return None
+
+        logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", os, kmajor, kminor, kpatch)
+
+        return ABITag(os, kmajor, kminor, kpatch)
+
+
+class SHNote:
+    def __init__(self, endian, buf):
+        self.endian = endian
+        self.buf = buf
+
+        self.type_: int = None
+        self.descsz: int = None
+        self.name: str = None
+
+        self._parse()
+
+    def _parse(self):
+        namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0)
+        name_offset = 0xC
+        self.desc_offset = name_offset + align(namesz, 0x4)
+
+        logger.debug("sh:namesz: 0x%02x descsz: 0x%02x type: 0x%04x", namesz, self.descsz, self.type_)
+
+        name_buf = self.buf[name_offset : name_offset + namesz]
+        self.name = read_cstr(name_buf, 0x0)
+        logger.debug("sh:name: %s", self.name)
+
+    @property
+    def abi_tag(self) -> Optional[ABITag]:
+        if self.name != "GNU":
+            return None
+
+        if self.descsz < 16:
+            return None
+
+        desc = self.buf[self.desc_offset : self.desc_offset + self.descsz]
+        abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0)
+        logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag)
+
+        os = GNU_ABI_TAG.get(abi_tag)
+        if not os:
+            return None
+
+        logger.debug("abi tag: %s earliest compatible kernel: %d.%d.%d", os, kmajor, kminor, kpatch)
+        return ABITag(os, kmajor, kminor, kpatch)
+
+
+def guess_os_from_osabi(elf) -> Optional[OS]:
+    """guess the OS from the `ei_osabi` property of `elf`; None when it yields no OS."""
+    return elf.ei_osabi
+
+
+def guess_os_from_ph_notes(elf) -> Optional[OS]:
+    # search for PT_NOTE sections that specify an OS
+    # for example, on Linux there is a GNU section with minimum kernel version
+    PT_NOTE = 0x4
+    for phdr in elf.program_headers:
+        if phdr.type != PT_NOTE:
+            continue
+
+        note = PHNote(elf.endian, phdr.buf)
+
+        if note.type_ != 1:
+            # > The type field shall be 1.
+            # Linux Standard Base Specification 1.2
+            # ref: https://refspecs.linuxfoundation.org/LSB_1.2.0/gLSB/noteabitag.html
+            continue
+
+        if note.name == "Linux":
+            logger.debug("note owner: %s", "LINUX")
+            return OS.LINUX
+        elif note.name == "OpenBSD":
+            logger.debug("note owner: %s", "OPENBSD")
+            return OS.OPENBSD
+        elif note.name == "NetBSD":
+            logger.debug("note owner: %s", "NETBSD")
+            return OS.NETBSD
+        elif note.name == "FreeBSD":
+            logger.debug("note owner: %s", "FREEBSD")
+            return OS.FREEBSD
+        elif note.name == "GNU":
+            abi_tag = note.abi_tag
+            if abi_tag:
+                return abi_tag.os
+            else:
+                # cannot make a guess about the OS, but probably linux or hurd
+                pass
+
+    return None
+
+
+def guess_os_from_sh_notes(elf) -> Optional[OS]:
+    # search for notes stored in sections that aren't visible in program headers.
+    # e.g. .note.Linux in Linux kernel modules.
+    SHT_NOTE = 0x7
+    for shdr in elf.section_headers:
+        if shdr.type != SHT_NOTE:
+            continue
+
+        note = SHNote(elf.endian, shdr.buf)
+
+        if note.name == "Linux":
+            logger.debug("note owner: %s", "LINUX")
+            return OS.LINUX
+        elif note.name == "OpenBSD":
+            logger.debug("note owner: %s", "OPENBSD")
+            return OS.OPENBSD
+        elif note.name == "NetBSD":
+            logger.debug("note owner: %s", "NETBSD")
+            return OS.NETBSD
+        elif note.name == "FreeBSD":
+            logger.debug("note owner: %s", "FREEBSD")
+            return OS.FREEBSD
+        elif note.name == "GNU":
+            abi_tag = note.abi_tag
+            if abi_tag:
+                return abi_tag.os
+            else:
+                # cannot make a guess about the OS, but probably linux or hurd
+                pass
+
+    return None
+
+
+def guess_os_from_linker(elf) -> Optional[OS]:
+    # search for recognizable dynamic linkers (interpreters)
+    # for example, on linux, we see file paths like: /lib64/ld-linux-x86-64.so.2
+    linker = elf.linker
+    if linker and "ld-linux" in elf.linker:
+        return OS.LINUX
+
+    return None
+
+
+def guess_os_from_abi_versions_needed(elf) -> Optional[OS]:
+    # then lets look for GLIBC symbol versioning requirements.
+    # this will let us guess about linux/hurd in some cases.
+
+    versions_needed = elf.versions_needed
+    if any(map(lambda abi: abi.startswith("GLIBC"), itertools.chain(*versions_needed.values()))):
+        # there are any GLIBC versions needed
+
+        if elf.e_machine != "i386":
+            # GLIBC runs on Linux and Hurd.
+            # for Hurd, its *only* on i386.
+            # so if we're not on i386, then we're on Linux.
+            return OS.LINUX
+
+        else:
+            # we're on i386, so we could be on either Linux or Hurd.
+            linker = elf.linker
+
+            if linker and "ld-linux" in linker:
+                return OS.LINUX
+
+            elif linker and "/ld.so" in linker:
+                return OS.HURD
+
+            else:
+                # we don't have any good guesses based on versions needed
+                pass
+
+    return None
+
+
+def guess_os_from_needed_dependencies(elf) -> Optional[OS]:
+    for needed in elf.needed:
+        if needed.startswith("libmachuser.so"):
+            return OS.HURD
+        if needed.startswith("libhurduser.so"):
+            return OS.HURD
+
+    return None
+
+
+def detect_elf_os(f) -> str:
+    """
+    f: type Union[BinaryIO, IDAIO]
+    """
+    elf = ELF(f)
+
+    osabi_guess = guess_os_from_osabi(elf)
+    logger.debug("guess: osabi: %s", osabi_guess)
+
+    ph_notes_guess = guess_os_from_ph_notes(elf)
+    logger.debug("guess: ph notes: %s", ph_notes_guess)
+
+    sh_notes_guess = guess_os_from_sh_notes(elf)
+    logger.debug("guess: sh notes: %s", sh_notes_guess)
+
+    linker_guess = guess_os_from_linker(elf)
+    logger.debug("guess: linker: %s", linker_guess)
+
+    abi_versions_needed_guess = guess_os_from_abi_versions_needed(elf)
+    logger.debug("guess: ABI versions needed: %s", abi_versions_needed_guess)
+
+    needed_dependencies_guess = guess_os_from_needed_dependencies(elf)
+    logger.debug("guess: needed dependencies: %s", needed_dependencies_guess)
+
+    ret = None
+
+    if osabi_guess:
+        ret = osabi_guess
+
+    elif ph_notes_guess:
+        ret = ph_notes_guess
+
+    elif sh_notes_guess:
+        ret = sh_notes_guess
+
+    elif linker_guess:
+        ret = linker_guess
+
+    elif abi_versions_needed_guess:
+        ret = abi_versions_needed_guess
+
+    elif needed_dependencies_guess:
+        ret = needed_dependencies_guess

    return ret.value if ret is not None else "unknown"


-class Arch(str, Enum):
-    I386 = "i386"
-    AMD64 = "amd64"
-
-
 def detect_elf_arch(f: BinaryIO) -> str:
-    f.seek(0x0)
-    file_header = f.read(0x40)
-
-    if not file_header.startswith(b"\x7fELF"):
-        raise CorruptElfFile("missing magic header")
-
-    (ei_data,) = struct.unpack_from("B", file_header, 5)
-    logger.debug("ei_data: 0x%02x", ei_data)
-
-    if ei_data == 1:
-        endian = "<"
-    elif ei_data == 2:
-        endian = ">"
-    else:
-        raise CorruptElfFile("not an ELF file: invalid ei_data: 0x%02x" % ei_data)
-
-    (ei_machine,) = struct.unpack_from(endian + "H", file_header, 0x12)
-    logger.debug("ei_machine: 0x%02x", ei_machine)
-
-    EM_386 = 0x3
-    EM_X86_64 = 0x3E
-    if ei_machine == EM_386:
-        return Arch.I386
-    elif ei_machine == EM_X86_64:
-        return Arch.AMD64
-    else:
-        # not really unknown, but unsupport at the moment:
-        # https://github.com/eliben/pyelftools/blob/ab444d982d1849191e910299a985989857466620/elftools/elf/enums.py#L73
-        return "unknown"
+    return ELF(f).e_machine or "unknown"
--- a/capa/features/extractors/elffile.py
+++ b/capa/features/extractors/elffile.py
@@ -7,7 +7,6 @@
 # See the License for the specific language governing permissions and limitations under the License.
 import io
 import logging
-import contextlib
 from typing import Tuple, Iterator

 from elftools.elf.elffile import ELFFile, SymbolTableSection
@@ -16,7 +15,6 @@ import capa.features.extractors.common
 from capa.features.file import Import, Section
 from capa.features.common import OS, FORMAT_ELF, Arch, Format, Feature
 from capa.features.address import NO_ADDRESS, FileOffsetAddress, AbsoluteVirtualAddress
-from capa.features.extractors.elf import Arch as ElfArch
 from capa.features.extractors.base_extractor import FeatureExtractor

 logger = logging.getLogger(__name__)
@@ -26,17 +24,17 @@ def extract_file_import_names(elf, **kwargs):
    # see https://github.com/eliben/pyelftools/blob/0664de05ed2db3d39041e2d51d19622a8ef4fb0f/scripts/readelf.py#L372
    symbol_tables = [(idx, s) for idx, s in enumerate(elf.iter_sections()) if isinstance(s, SymbolTableSection)]

-    for section_index, section in symbol_tables:
+    for _, section in symbol_tables:
        if not isinstance(section, SymbolTableSection):
            continue

        if section["sh_entsize"] == 0:
-            logger.debug("Symbol table '%s' has a sh_entsize of zero!" % (section.name))
+            logger.debug("Symbol table '%s' has a sh_entsize of zero!", section.name)
            continue

-        logger.debug("Symbol table '%s' contains %s entries:" % (section.name, section.num_symbols()))
+        logger.debug("Symbol table '%s' contains %s entries:", section.name, section.num_symbols())

-        for nsym, symbol in enumerate(section.iter_symbols()):
+        for _, symbol in enumerate(section.iter_symbols()):
            if symbol.name and symbol.entry.st_info.type == "STT_FUNC":
                # TODO symbol address
                # TODO symbol version info?
@@ -73,9 +71,9 @@ def extract_file_arch(elf, **kwargs):
    # TODO merge with capa.features.extractors.elf.detect_elf_arch()
    arch = elf.get_machine_arch()
    if arch == "x86":
-        yield Arch(ElfArch.I386), NO_ADDRESS
+        yield Arch("i386"), NO_ADDRESS
    elif arch == "x64":
-        yield Arch(ElfArch.AMD64), NO_ADDRESS
+        yield Arch("amd64"), NO_ADDRESS
    else:
        logger.warning("unsupported architecture: %s", arch)

@@ -153,8 +151,8 @@ class ElfFeatureExtractor(FeatureExtractor):
    def extract_insn_features(self, f, bb, insn):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

-    def is_library_function(self, va):
+    def is_library_function(self, addr):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")

-    def get_function_name(self, va):
+    def get_function_name(self, addr):
        raise NotImplementedError("ElfFeatureExtractor can only be used to extract file features")
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -284,6 +284,10 @@ def get_data_path_by_name(name):
        return os.path.join(CD, "data", "0953cc3b77ed2974b09e3a00708f88de931d681e2d0cb64afbaf714610beabe6.exe_")
    elif name.startswith("_039a6"):
        return os.path.join(CD, "data", "039a6336d0802a2255669e6867a5679c7eb83313dbc61fb1c7232147379bd304.exe_")
+    elif name.startswith("b5f052"):
+        return os.path.join(CD, "data", "b5f0524e69b3a3cf636c7ac366ca57bf5e3a8fdc8a9f01caf196c611a7918a87.elf_")
+    elif name.startswith("bf7a9c"):
+        return os.path.join(CD, "data", "bf7a9c8bdfa6d47e01ad2b056264acc3fd90cf43fe0ed8deec93ab46b47d76cb.elf_")
    else:
        raise ValueError("unexpected sample fixture: %s" % name)

--- a/tests/test_os_detection.py
+++ b/tests/test_os_detection.py
@@ -14,13 +14,49 @@ from fixtures import *
 import capa.features.extractors.elf


-def test_elf_section_gnu_abi_tag():
+def test_elf_sh_notes():
+    # guess: osabi: None
+    # guess: ph notes: None
+    # guess: sh notes: OS.LINUX
+    # guess: linker: None
+    # guess: ABI versions needed: None
+    # guess: needed dependencies: None
    path = get_data_path_by_name("2f7f5f")
    with open(path, "rb") as f:
        assert capa.features.extractors.elf.detect_elf_os(f) == "linux"


-def test_elf_program_header_gnu_abi_tag():
+def test_elf_pt_notes():
+    # guess: osabi: None
+    # guess: ph notes: None
+    # guess: sh notes: OS.LINUX
+    # guess: linker: OS.LINUX
+    # guess: ABI versions needed: OS.LINUX
+    # guess: needed dependencies: None
    path = get_data_path_by_name("7351f.elf")
    with open(path, "rb") as f:
        assert capa.features.extractors.elf.detect_elf_os(f) == "linux"
+
+
+def test_elf_so_needed():
+    # guess: osabi: None
+    # guess: ph notes: None
+    # guess: sh notes: OS.HURD
+    # guess: linker: None
+    # guess: ABI versions needed: OS.HURD
+    # guess: needed dependencies: OS.HURD
+    path = get_data_path_by_name("b5f052")
+    with open(path, "rb") as f:
+        assert capa.features.extractors.elf.detect_elf_os(f) == "hurd"
+
+
+def test_elf_abi_version_hurd():
+    # guess: osabi: None
+    # guess: ph notes: None
+    # guess: sh notes: OS.HURD
+    # guess: linker: None
+    # guess: ABI versions needed: OS.HURD
+    # guess: needed dependencies: None
+    path = get_data_path_by_name("bf7a9c")
+    with open(path, "rb") as f:
+        assert capa.features.extractors.elf.detect_elf_os(f) == "hurd"