From c71139f2a3723f73528bc3a803cc2281589ad7c9 Mon Sep 17 00:00:00 2001 From: vibhatsu Date: Thu, 30 Jan 2025 23:04:56 +0530 Subject: [PATCH 1/5] refactor: replace binascii with bytes for hex conversions Signed-off-by: vibhatsu --- capa/features/extractors/cape/models.py | 3 +-- capa/features/freeze/features.py | 5 ++--- scripts/import-to-ida.py | 3 +-- tests/fixtures.py | 9 ++++----- tests/test_binexport_features.py | 3 +-- tests/test_ida_features.py | 3 +-- 6 files changed, 10 insertions(+), 16 deletions(-) diff --git a/capa/features/extractors/cape/models.py b/capa/features/extractors/cape/models.py index 7117cc935..bfb3e21d6 100644 --- a/capa/features/extractors/cape/models.py +++ b/capa/features/extractors/cape/models.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import binascii from typing import Any, Union, Literal, Optional, Annotated, TypeAlias from pydantic import Field, BaseModel, ConfigDict @@ -27,7 +26,7 @@ def validate_hex_int(value): def validate_hex_bytes(value): - return binascii.unhexlify(value) if isinstance(value, str) else value + return bytes.fromhex(value) if isinstance(value, str) else value HexInt = Annotated[int, BeforeValidator(validate_hex_int)] diff --git a/capa/features/freeze/features.py b/capa/features/freeze/features.py index 683023944..151964e55 100644 --- a/capa/features/freeze/features.py +++ b/capa/features/freeze/features.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import binascii from typing import Union, Literal, Optional, Annotated from pydantic import Field, BaseModel, ConfigDict @@ -85,7 +84,7 @@ def to_capa(self) -> capa.features.common.Feature: return capa.features.insn.Number(self.number, description=self.description) elif isinstance(self, BytesFeature): - return capa.features.common.Bytes(binascii.unhexlify(self.bytes), description=self.description) + return capa.features.common.Bytes(bytes.fromhex(self.bytes), description=self.description) elif isinstance(self, OffsetFeature): return capa.features.insn.Offset(self.offset, description=self.description) @@ -191,7 +190,7 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature": elif isinstance(f, capa.features.common.Bytes): buf = f.value assert isinstance(buf, bytes) - return BytesFeature(bytes=binascii.hexlify(buf).decode("ascii"), description=f.description) + return BytesFeature(bytes=bytes.hex(buf), description=f.description) elif isinstance(f, capa.features.insn.Offset): assert isinstance(f.value, int) diff --git a/scripts/import-to-ida.py b/scripts/import-to-ida.py index 89ba19454..3c468c414 100644 --- a/scripts/import-to-ida.py +++ b/scripts/import-to-ida.py @@ -36,7 +36,6 @@ """ import logging -import binascii from pathlib import Path import ida_nalt @@ -85,7 +84,7 @@ def main(): # # see: https://github.com/idapython/bin/issues/11 a = meta.sample.md5.lower() - b = binascii.hexlify(ida_nalt.retrieve_input_file_md5()).decode("ascii").lower() + b = bytes.hex(ida_nalt.retrieve_input_file_md5()).lower() if not a.startswith(b): logger.error("sample mismatch") return -2 diff --git a/tests/fixtures.py b/tests/fixtures.py index 187a5f05f..b9199061d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -13,7 +13,6 @@ # limitations under the License. -import binascii import contextlib import collections from pathlib import Path @@ -942,17 +941,17 @@ def parametrize(params, values, **kwargs): # insn/string, direct memory reference ("mimikatz", "function=0x46D6CE", capa.features.common.String("(null)"), True), # insn/bytes - ("mimikatz", "function=0x401517", capa.features.common.Bytes(binascii.unhexlify("CA3B0E000000F8AF47")), True), - ("mimikatz", "function=0x404414", capa.features.common.Bytes(binascii.unhexlify("0180000040EA4700")), True), + ("mimikatz", "function=0x401517", capa.features.common.Bytes(bytes.fromhex("CA3B0E000000F8AF47")), True), + ("mimikatz", "function=0x404414", capa.features.common.Bytes(bytes.fromhex("0180000040EA4700")), True), # don't extract byte features for obvious strings ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardControl".encode("utf-16le")), False), ("mimikatz", "function=0x40105D", capa.features.common.Bytes("SCardTransmit".encode("utf-16le")), False), ("mimikatz", "function=0x40105D", capa.features.common.Bytes("ACR > ".encode("utf-16le")), False), ("mimikatz", "function=0x40105D", capa.features.common.Bytes("nope".encode("ascii")), False), # push offset aAcsAcr1220 ; "ACS..." -> where ACS == 41 00 43 00 == valid pointer to middle of instruction - ("mimikatz", "function=0x401000", capa.features.common.Bytes(binascii.unhexlify("FDFF59F647")), False), + ("mimikatz", "function=0x401000", capa.features.common.Bytes(bytes.fromhex("FDFF59F647")), False), # IDA features included byte sequences read from invalid memory, fixed in #409 - ("mimikatz", "function=0x44570F", capa.features.common.Bytes(binascii.unhexlify("FF" * 256)), False), + ("mimikatz", "function=0x44570F", capa.features.common.Bytes(bytes.fromhex("FF" * 256)), False), # insn/bytes, pointer to string bytes ("mimikatz", "function=0x44EDEF", capa.features.common.Bytes("INPUTEVENT".encode("utf-16le")), False), # insn/characteristic(nzxor) diff --git a/tests/test_binexport_features.py b/tests/test_binexport_features.py index 8e76732ce..2695230c0 100644 --- a/tests/test_binexport_features.py +++ b/tests/test_binexport_features.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import binascii from typing import cast import pytest @@ -302,7 +301,7 @@ ( "d1e650.ghidra.be2", "function=0x1165a4", - capa.features.common.Bytes(binascii.unhexlify("E405B89370BA6B419CD7925275BF6FCC1E8360CC")), + capa.features.common.Bytes(bytes.fromhex("E405B89370BA6B419CD7925275BF6FCC1E8360CC")), True, ), # # don't extract byte features for obvious strings diff --git a/tests/test_ida_features.py b/tests/test_ida_features.py index da1a2ca4b..5e2f7380a 100644 --- a/tests/test_ida_features.py +++ b/tests/test_ida_features.py @@ -60,7 +60,6 @@ import sys import inspect import logging -import binascii import traceback from pathlib import Path @@ -86,7 +85,7 @@ def check_input_file(wanted): except UnicodeDecodeError: # in IDA 7.5 or so, GetInputFileMD5 started returning raw binary # rather than the hex digest - found = binascii.hexlify(idautils.GetInputFileMD5()[:15]).decode("ascii").lower() + found = bytes.hex(idautils.GetInputFileMD5()[:15]).lower() if not wanted.startswith(found): raise RuntimeError(f"please run the tests against sample with MD5: `{wanted}`") From 483f8c9a85879d0d2eb0ac34fd5a5c634e4836f8 Mon Sep 17 00:00:00 2001 From: vibhatsu Date: Thu, 30 Jan 2025 23:07:33 +0530 Subject: [PATCH 2/5] refactor: replace struct unpacking with bytes conversion Signed-off-by: vibhatsu --- capa/features/extractors/common.py | 5 +- capa/features/extractors/elf.py | 247 ++++++++++++------ capa/features/extractors/ghidra/basicblock.py | 9 +- capa/features/extractors/ghidra/file.py | 3 +- capa/features/extractors/helpers.py | 3 +- capa/features/extractors/ida/basicblock.py | 9 +- capa/features/extractors/ida/file.py | 3 +- capa/features/extractors/viv/basicblock.py | 9 +- 8 files changed, 190 insertions(+), 98 deletions(-) diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index f8918b8d8..9f25243d9 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -15,7 +15,6 @@ import io import re import logging -import binascii import contextlib from typing import Iterator @@ -114,7 +113,7 @@ def extract_arch(buf) -> Iterator[tuple[Feature, Address]]: # rules that rely on arch conditions will fail to match on shellcode. # # for (2), this logic will need to be updated as the format is implemented. - logger.debug("unsupported file format: %s, will not guess Arch", binascii.hexlify(buf[:4]).decode("ascii")) + logger.debug("unsupported file format: %s, will not guess Arch", bytes.hex(buf[:4])) return @@ -145,5 +144,5 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[tuple[Feature, Address]]: # rules that rely on OS conditions will fail to match on shellcode. # # for (2), this logic will need to be updated as the format is implemented. - logger.debug("unsupported file format: %s, will not guess OS", binascii.hexlify(buf[:4]).decode("ascii")) + logger.debug("unsupported file format: %s, will not guess OS", bytes.hex(buf[:4])) return diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index a3d52082e..15e655b23 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -12,12 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import struct import logging import itertools import collections from enum import Enum -from typing import TYPE_CHECKING, BinaryIO, Iterator, Optional +from typing import TYPE_CHECKING, Literal, BinaryIO, Iterator, Optional from dataclasses import dataclass if TYPE_CHECKING: @@ -132,7 +131,7 @@ def __init__(self, f: BinaryIO): # these will all be initialized in `_parse()` self.bitness: int - self.endian: str + self.endian: Literal["little", "big"] self.e_phentsize: int self.e_phnum: int self.e_shentsize: int @@ -150,7 +149,7 @@ def _parse(self): if not self.file_header.startswith(b"\x7fELF"): raise CorruptElfFile("missing magic header") - ei_class, ei_data = struct.unpack_from("BB", self.file_header, 4) + ei_class, ei_data = int.from_bytes(self.file_header[4:5]), int.from_bytes(self.file_header[5:6]) logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data) if ei_class == 1: self.bitness = 32 @@ -160,24 +159,28 @@ def _parse(self): raise CorruptElfFile(f"invalid ei_class: 0x{ei_class:02x}") if ei_data == 1: - self.endian = "<" + self.endian = "little" elif ei_data == 2: - self.endian = ">" + self.endian = "big" else: raise CorruptElfFile(f"not an ELF file: invalid ei_data: 0x{ei_data:02x}") if self.bitness == 32: - e_phoff, e_shoff = struct.unpack_from(self.endian + "II", self.file_header, 0x1C) - self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2A) - self.e_shentsize, self.e_shnum, self.e_shstrndx = struct.unpack_from( - self.endian + "HHH", self.file_header, 0x2E - ) + e_phoff = int.from_bytes(self.file_header[0x1C:0x20], byteorder=self.endian, signed=False) + e_shoff = int.from_bytes(self.file_header[0x20:0x24], byteorder=self.endian, signed=False) + self.e_phentsize = int.from_bytes(self.file_header[0x2A:0x2C], byteorder=self.endian, signed=False) + self.e_phnum = int.from_bytes(self.file_header[0x2C:0x2E], byteorder=self.endian, signed=False) + self.e_shentsize = int.from_bytes(self.file_header[0x2E:0x30], byteorder=self.endian, signed=False) + self.e_shnum = int.from_bytes(self.file_header[0x30:0x32], byteorder=self.endian, signed=False) + self.e_shstrndx = int.from_bytes(self.file_header[0x32:0x34], byteorder=self.endian, signed=False) elif self.bitness == 64: - e_phoff, e_shoff = struct.unpack_from(self.endian + "QQ", self.file_header, 0x20) - self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x36) - self.e_shentsize, self.e_shnum, self.e_shstrndx = struct.unpack_from( - self.endian + "HHH", self.file_header, 0x3A - ) + e_phoff = int.from_bytes(self.file_header[0x20:0x28], byteorder=self.endian, signed=False) + e_shoff = int.from_bytes(self.file_header[0x28:0x30], byteorder=self.endian, signed=False) + self.e_phentsize = int.from_bytes(self.file_header[0x36:0x38], byteorder=self.endian, signed=False) + self.e_phnum = int.from_bytes(self.file_header[0x38:0x3A], byteorder=self.endian, signed=False) + self.e_shentsize = int.from_bytes(self.file_header[0x3A:0x3C], byteorder=self.endian, signed=False) + self.e_shnum = int.from_bytes(self.file_header[0x3C:0x3E], byteorder=self.endian, signed=False) + self.e_shstrndx = int.from_bytes(self.file_header[0x3E:0x40], byteorder=self.endian, signed=False) else: raise NotImplementedError() @@ -227,7 +230,7 @@ def _parse(self): @property def ei_osabi(self) -> Optional[OS]: - (ei_osabi,) = struct.unpack_from(self.endian + "B", self.file_header, 7) + ei_osabi = int.from_bytes(self.file_header[7:8], byteorder=self.endian, signed=False) return ELF.OSABI.get(ei_osabi) MACHINE = { @@ -324,7 +327,7 @@ def ei_osabi(self) -> Optional[OS]: @property def e_machine(self) -> Optional[str]: - (e_machine,) = struct.unpack_from(self.endian + "H", self.file_header, 0x12) + (e_machine,) = (int.from_bytes(self.file_header[0x12:0x14], byteorder=self.endian, signed=False),) return ELF.MACHINE.get(e_machine) def parse_program_header(self, i) -> Phdr: @@ -332,13 +335,21 @@ def parse_program_header(self, i) -> Phdr: phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize] if self.bitness == 32: - p_type, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_flags = struct.unpack_from( - self.endian + "IIIIIII", phent, 0x0 - ) + p_type = int.from_bytes(phent[0:4], byteorder=self.endian, signed=False) + p_offset = int.from_bytes(phent[4:8], byteorder=self.endian, signed=False) + p_vaddr = int.from_bytes(phent[8:12], byteorder=self.endian, signed=False) + p_paddr = int.from_bytes(phent[12:16], byteorder=self.endian, signed=False) + p_filesz = int.from_bytes(phent[16:20], byteorder=self.endian, signed=False) + p_memsz = int.from_bytes(phent[20:24], byteorder=self.endian, signed=False) + p_flags = int.from_bytes(phent[24:28], byteorder=self.endian, signed=False) elif self.bitness == 64: - p_type, p_flags, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz = struct.unpack_from( - self.endian + "IIQQQQQ", phent, 0x0 - ) + p_type = int.from_bytes(phent[0:4], byteorder=self.endian, signed=False) + p_flags = int.from_bytes(phent[4:8], byteorder=self.endian, signed=False) + p_offset = int.from_bytes(phent[8:16], byteorder=self.endian, signed=False) + p_vaddr = int.from_bytes(phent[16:24], byteorder=self.endian, signed=False) + p_paddr = int.from_bytes(phent[24:32], byteorder=self.endian, signed=False) + p_filesz = int.from_bytes(phent[32:40], byteorder=self.endian, signed=False) + p_memsz = int.from_bytes(phent[40:48], byteorder=self.endian, signed=False) else: raise NotImplementedError() @@ -362,13 +373,23 @@ def parse_section_header(self, i) -> Shdr: shent = self.shbuf[shent_offset : shent_offset + self.e_shentsize] if self.bitness == 32: - sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, _, _, sh_entsize = struct.unpack_from( - self.endian + "IIIIIIIIII", shent, 0x0 - ) + sh_name = int.from_bytes(shent[0:4], byteorder=self.endian, signed=False) + sh_type = int.from_bytes(shent[4:8], byteorder=self.endian, signed=False) + sh_flags = int.from_bytes(shent[8:12], byteorder=self.endian, signed=False) + sh_addr = int.from_bytes(shent[12:16], byteorder=self.endian, signed=False) + sh_offset = int.from_bytes(shent[16:20], byteorder=self.endian, signed=False) + sh_size = int.from_bytes(shent[20:24], byteorder=self.endian, signed=False) + sh_link = int.from_bytes(shent[24:28], byteorder=self.endian, signed=False) + sh_entsize = int.from_bytes(shent[36:40], byteorder=self.endian, signed=False) elif self.bitness == 64: - sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, _, _, sh_entsize = struct.unpack_from( - self.endian + "IIQQQQIIQQ", shent, 0x0 - ) + sh_name = int.from_bytes(shent[0:4], byteorder=self.endian, signed=False) + sh_type = int.from_bytes(shent[4:8], byteorder=self.endian, signed=False) + sh_flags = int.from_bytes(shent[8:16], byteorder=self.endian, signed=False) + sh_addr = int.from_bytes(shent[16:24], byteorder=self.endian, signed=False) + sh_offset = int.from_bytes(shent[24:32], byteorder=self.endian, signed=False) + sh_size = int.from_bytes(shent[32:40], byteorder=self.endian, signed=False) + sh_link = int.from_bytes(shent[40:44], byteorder=self.endian, signed=False) + sh_entsize = int.from_bytes(shent[56:64], byteorder=self.endian, signed=False) else: raise NotImplementedError() @@ -426,9 +447,11 @@ def versions_needed(self) -> dict[str, set[str]]: vn_offset = 0x0 while True: # ElfXX_Verneed layout is the same on 32 and 64 bit - vn_version, vn_cnt, vn_file, vn_aux, vn_next = struct.unpack_from( - self.endian + "HHIII", shdr.buf, vn_offset - ) + vn_version = int.from_bytes(shdr.buf[vn_offset : vn_offset + 2], byteorder=self.endian, signed=False) + vn_cnt = int.from_bytes(shdr.buf[vn_offset + 2 : vn_offset + 4], byteorder=self.endian, signed=False) + vn_file = int.from_bytes(shdr.buf[vn_offset + 4 : vn_offset + 8], byteorder=self.endian, signed=False) + vn_aux = int.from_bytes(shdr.buf[vn_offset + 8 : vn_offset + 12], byteorder=self.endian, signed=False) + vn_next = int.from_bytes(shdr.buf[vn_offset + 12 : vn_offset + 16], byteorder=self.endian, signed=False) if vn_version != 1: # unexpected format, don't try to keep parsing break @@ -442,7 +465,12 @@ def versions_needed(self) -> dict[str, set[str]]: vna_offset = vn_offset + vn_aux for _ in range(vn_cnt): # ElfXX_Vernaux layout is the same on 32 and 64 bit - _, _, _, vna_name, vna_next = struct.unpack_from(self.endian + "IHHII", shdr.buf, vna_offset) + vna_name = int.from_bytes( + shdr.buf[vna_offset + 8 : vna_offset + 12], byteorder=self.endian, signed=False + ) + vna_next = int.from_bytes( + shdr.buf[vna_offset + 12 : vna_offset + 16], byteorder=self.endian, signed=False + ) # ABI names, like: "GLIBC_2.2.5" abi = read_cstr(linked_shdr.buf, vna_name) @@ -473,10 +501,12 @@ def dynamic_entries(self) -> Iterator[tuple[int, int]]: offset = 0x0 while True: if self.bitness == 32: - d_tag, d_val = struct.unpack_from(self.endian + "II", phdr.buf, offset) + d_tag = int.from_bytes(phdr.buf[offset : offset + 4], byteorder=self.endian, signed=False) + d_val = int.from_bytes(phdr.buf[offset + 4 : offset + 8], byteorder=self.endian, signed=False) offset += 8 elif self.bitness == 64: - d_tag, d_val = struct.unpack_from(self.endian + "QQ", phdr.buf, offset) + d_tag = int.from_bytes(phdr.buf[offset : offset + 8], byteorder=self.endian, signed=False) + d_val = int.from_bytes(phdr.buf[offset + 8 : offset + 16], byteorder=self.endian, signed=False) offset += 16 else: raise NotImplementedError() @@ -580,7 +610,7 @@ class ABITag: class PHNote: - def __init__(self, endian: str, buf: bytes): + def __init__(self, endian: Literal["big", "little"], buf: bytes): self.endian = endian self.buf = buf @@ -592,7 +622,9 @@ def __init__(self, endian: str, buf: bytes): self._parse() def _parse(self): - namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) + namesz = int.from_bytes(self.buf[0x0:0x4], byteorder=self.endian, signed=False) + self.descsz = int.from_bytes(self.buf[0x4:0x8], byteorder=self.endian, signed=False) + self.type_ = int.from_bytes(self.buf[0x8:0xC], byteorder=self.endian, signed=False) name_offset = 0xC self.desc_offset = name_offset + align(namesz, 0x4) @@ -616,7 +648,10 @@ def abi_tag(self) -> Optional[ABITag]: return None desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] - abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) + abi_tag = int.from_bytes(desc[0:4], byteorder=self.endian, signed=False) + kmajor = int.from_bytes(desc[4:8], byteorder=self.endian, signed=False) + kminor = int.from_bytes(desc[8:12], byteorder=self.endian, signed=False) + kpatch = int.from_bytes(desc[12:16], byteorder=self.endian, signed=False) logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) os = GNU_ABI_TAG.get(abi_tag) @@ -629,7 +664,7 @@ def abi_tag(self) -> Optional[ABITag]: class SHNote: - def __init__(self, endian: str, buf: bytes): + def __init__(self, endian: Literal["big", "little"], buf: bytes): self.endian = endian self.buf = buf @@ -641,7 +676,9 @@ def __init__(self, endian: str, buf: bytes): self._parse() def _parse(self): - namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) + namesz = int.from_bytes(self.buf[0x0:0x4], byteorder=self.endian, signed=False) + self.descsz = int.from_bytes(self.buf[0x4:0x8], byteorder=self.endian, signed=False) + self.type_ = int.from_bytes(self.buf[0x8:0xC], byteorder=self.endian, signed=False) name_offset = 0xC self.desc_offset = name_offset + align(namesz, 0x4) @@ -660,7 +697,10 @@ def abi_tag(self) -> Optional[ABITag]: return None desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] - abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) + abi_tag = int.from_bytes(desc[0:4], byteorder=self.endian, signed=False) + kmajor = int.from_bytes(desc[4:8], byteorder=self.endian, signed=False) + kminor = int.from_bytes(desc[8:12], byteorder=self.endian, signed=False) + kpatch = int.from_bytes(desc[12:16], byteorder=self.endian, signed=False) logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) os = GNU_ABI_TAG.get(abi_tag) @@ -684,7 +724,7 @@ class Symbol: class SymTab: def __init__( self, - endian: str, + endian: Literal["big", "little"], bitness: int, symtab: Shdr, strtab: Shdr, @@ -696,7 +736,7 @@ def __init__( self._parse(endian, bitness, symtab.buf) - def _parse(self, endian: str, bitness: int, symtab_buf: bytes) -> None: + def _parse(self, endian: Literal["big", "little"], bitness: int, symtab_buf: bytes) -> None: """ return the symbol's information in the order specified by sys/elf32.h @@ -706,12 +746,62 @@ def _parse(self, endian: str, bitness: int, symtab_buf: bytes) -> None: for i in range(int(len(self.symtab.buf) / self.symtab.entsize)): if bitness == 32: - name_offset, value, size, info, other, shndx = struct.unpack_from( - endian + "IIIBBH", symtab_buf, i * self.symtab.entsize + name_offset = int.from_bytes( + symtab_buf[i * self.symtab.entsize : i * self.symtab.entsize + 4], byteorder=endian, signed=False + ) + value = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 4 : i * self.symtab.entsize + 8], + byteorder=endian, + signed=False, + ) + size = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 8 : i * self.symtab.entsize + 12], + byteorder=endian, + signed=False, + ) + info = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 12 : i * self.symtab.entsize + 13], + byteorder=endian, + signed=False, + ) + other = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 13 : i * self.symtab.entsize + 14], + byteorder=endian, + signed=False, + ) + shndx = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 14 : i * self.symtab.entsize + 16], + byteorder=endian, + signed=False, ) elif bitness == 64: - name_offset, info, other, shndx, value, size = struct.unpack_from( - endian + "IBBHQQ", symtab_buf, i * self.symtab.entsize + name_offset = int.from_bytes( + symtab_buf[i * self.symtab.entsize : i * self.symtab.entsize + 4], byteorder=endian, signed=False + ) + info = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 4 : i * self.symtab.entsize + 5], + byteorder=endian, + signed=False, + ) + other = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 5 : i * self.symtab.entsize + 6], + byteorder=endian, + signed=False, + ) + shndx = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 6 : i * self.symtab.entsize + 8], + byteorder=endian, + signed=False, + ) + value = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 8 : i * self.symtab.entsize + 16], + byteorder=endian, + signed=False, + ) + size = int.from_bytes( + symtab_buf[i * self.symtab.entsize + 16 : i * self.symtab.entsize + 24], + byteorder=endian, + signed=False, ) self.symbols.append(Symbol(name_offset, value, size, info, other, shndx)) @@ -739,7 +829,7 @@ def get_symbols(self) -> Iterator[Symbol]: @classmethod def from_viv(cls, elf: "Elf.Elf") -> Optional["SymTab"]: - endian = "<" if elf.getEndian() == 0 else ">" + endian: Literal["big", "little"] = "little" if elf.getEndian() == 0 else "big" bitness = elf.bits SHT_SYMTAB = 0x2 @@ -1034,12 +1124,13 @@ def read_data(elf: ELF, rva: int, size: int) -> Optional[bytes]: def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]: + psize: int = 0 if elf.bitness == 32: struct_size = 8 - struct_format = elf.endian + "II" + psize = 4 elif elf.bitness == 64: struct_size = 16 - struct_format = elf.endian + "QQ" + psize = 8 else: raise ValueError("invalid psize") @@ -1047,7 +1138,8 @@ def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]: if not struct_buf: return None - addr, length = struct.unpack_from(struct_format, struct_buf, 0) + addr = int.from_bytes(struct_buf[0:psize], byteorder=elf.endian, signed=False) + length = int.from_bytes(struct_buf[psize : psize * 2], byteorder=elf.endian, signed=False) return read_data(elf, addr, length) @@ -1096,7 +1188,12 @@ def guess_os_from_go_buildinfo(elf: ELF) -> Optional[OS]: logger.debug("go buildinfo: no buildinfo magic") return None - psize, flags = struct.unpack_from(" Optional[OS]: # This is the uncommon path. Most samples will have an inline GOOS string. # # To find samples on VT, use the referenced VTGrep content searches. - info_format = { - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 00} - # like: 71e617e5cc7fda89bf67422ff60f437e9d54622382c5ed6ff31f75e601f9b22e - # in which the modinfo doesn't have GOOS. - (4, False): "II", - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01} - # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a - # in which the modinfo doesn't have GOOS. - (8, True): ">QQ", - } - - build_version_address, modinfo_address = struct.unpack_from( - info_format[(psize, is_big_endian)], buf, index + 0x10 - ) + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 00} + # like: 71e617e5cc7fda89bf67422ff60f437e9d54622382c5ed6ff31f75e601f9b22e + # in which the modinfo doesn't have GOOS. + # 4 byte size and little endian + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 00} + # like: 93d3b3e2a904c6c909e20f2f76c3c2e8d0c81d535eb46e5493b5701f461816c3 + # in which the modinfo doesn't have GOOS. + # 8 byte size and little endian + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 01} + # (no matches on VT today) + # 4 byte size and little endian + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01} + # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a + # in which the modinfo doesn't have GOOS. + # 8 byte size and big endian + + endian: Literal["big", "little"] = "big" if is_big_endian else "little" + if psize == 4: + build_version_address = int.from_bytes(buf[index + 0x10 : index + 0x14], byteorder=endian, signed=False) + modinfo_address = int.from_bytes(buf[index + 0x14 : index + 0x18], byteorder=endian, signed=False) + else: # psize == 8 + build_version_address = int.from_bytes(buf[index + 0x10 : index + 0x18], byteorder=endian, signed=False) + modinfo_address = int.from_bytes(buf[index + 0x18 : index + 0x20], byteorder=endian, signed=False) logger.debug("go buildinfo: build version address: 0x%x", build_version_address) logger.debug("go buildinfo: modinfo address: 0x%x", modinfo_address) diff --git a/capa/features/extractors/ghidra/basicblock.py b/capa/features/extractors/ghidra/basicblock.py index 25b73ee43..904c20e2e 100644 --- a/capa/features/extractors/ghidra/basicblock.py +++ b/capa/features/extractors/ghidra/basicblock.py @@ -14,7 +14,6 @@ import string -import struct from typing import Iterator import ghidra @@ -35,13 +34,13 @@ def get_printable_len(op: ghidra.program.model.scalar.Scalar) -> int: op_val = op.getValue() if op_bit_len == 8: - chars = struct.pack(" MAX_OFFSET_PE_AFTER_MZ: diff --git a/capa/features/extractors/helpers.py b/capa/features/extractors/helpers.py index eb546f504..f764a64ad 100644 --- a/capa/features/extractors/helpers.py +++ b/capa/features/extractors/helpers.py @@ -13,7 +13,6 @@ # limitations under the License. -import struct import builtins from typing import Iterator @@ -157,7 +156,7 @@ def carve_pe(pbytes: bytes, offset: int = 0) -> Iterator[tuple[int, int]]: if pblen < (e_lfanew + 4): continue - newoff = struct.unpack(" int: op_val = capa.features.extractors.ida.helpers.mask_op_val(op) if op.dtype == idaapi.dt_byte: - chars = struct.pack(" Iterator[tuple[int, int]]: if seg_max < (e_lfanew + 4): continue - newoff = struct.unpack(" MAX_OFFSET_PE_AFTER_MZ: diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index 0f95bdef1..65c1f7d0d 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -14,7 +14,6 @@ import string -import struct from typing import Iterator import envi @@ -119,13 +118,13 @@ def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int: Return string length if all operand bytes are ascii or utf16-le printable """ if oper.tsize == 1: - chars = struct.pack(" Date: Fri, 31 Jan 2025 00:03:18 +0530 Subject: [PATCH 3/5] simplify byte extraction for ELF header Signed-off-by: vibhatsu --- capa/features/extractors/elf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index 15e655b23..e1cb4bf30 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -149,7 +149,7 @@ def _parse(self): if not self.file_header.startswith(b"\x7fELF"): raise CorruptElfFile("missing magic header") - ei_class, ei_data = int.from_bytes(self.file_header[4:5]), int.from_bytes(self.file_header[5:6]) + ei_class, ei_data = self.file_header[4], self.file_header[5] logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data) if ei_class == 1: self.bitness = 32 From 84580ad51a5b2bd0f660d1a0234062e67370826a Mon Sep 17 00:00:00 2001 From: vibhatsu Date: Fri, 31 Jan 2025 13:03:54 +0530 Subject: [PATCH 4/5] Revert "refactor: replace struct unpacking with bytes conversion" This reverts commit 483f8c9a85879d0d2eb0ac34fd5a5c634e4836f8. --- capa/features/extractors/common.py | 5 +- capa/features/extractors/elf.py | 247 ++++++------------ capa/features/extractors/ghidra/basicblock.py | 9 +- capa/features/extractors/ghidra/file.py | 3 +- capa/features/extractors/helpers.py | 3 +- capa/features/extractors/ida/basicblock.py | 9 +- capa/features/extractors/ida/file.py | 3 +- capa/features/extractors/viv/basicblock.py | 9 +- 8 files changed, 98 insertions(+), 190 deletions(-) diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index 9f25243d9..f8918b8d8 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -15,6 +15,7 @@ import io import re import logging +import binascii import contextlib from typing import Iterator @@ -113,7 +114,7 @@ def extract_arch(buf) -> Iterator[tuple[Feature, Address]]: # rules that rely on arch conditions will fail to match on shellcode. # # for (2), this logic will need to be updated as the format is implemented. - logger.debug("unsupported file format: %s, will not guess Arch", bytes.hex(buf[:4])) + logger.debug("unsupported file format: %s, will not guess Arch", binascii.hexlify(buf[:4]).decode("ascii")) return @@ -144,5 +145,5 @@ def extract_os(buf, os=OS_AUTO) -> Iterator[tuple[Feature, Address]]: # rules that rely on OS conditions will fail to match on shellcode. # # for (2), this logic will need to be updated as the format is implemented. - logger.debug("unsupported file format: %s, will not guess OS", bytes.hex(buf[:4])) + logger.debug("unsupported file format: %s, will not guess OS", binascii.hexlify(buf[:4]).decode("ascii")) return diff --git a/capa/features/extractors/elf.py b/capa/features/extractors/elf.py index e1cb4bf30..a3d52082e 100644 --- a/capa/features/extractors/elf.py +++ b/capa/features/extractors/elf.py @@ -12,11 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +import struct import logging import itertools import collections from enum import Enum -from typing import TYPE_CHECKING, Literal, BinaryIO, Iterator, Optional +from typing import TYPE_CHECKING, BinaryIO, Iterator, Optional from dataclasses import dataclass if TYPE_CHECKING: @@ -131,7 +132,7 @@ def __init__(self, f: BinaryIO): # these will all be initialized in `_parse()` self.bitness: int - self.endian: Literal["little", "big"] + self.endian: str self.e_phentsize: int self.e_phnum: int self.e_shentsize: int @@ -149,7 +150,7 @@ def _parse(self): if not self.file_header.startswith(b"\x7fELF"): raise CorruptElfFile("missing magic header") - ei_class, ei_data = self.file_header[4], self.file_header[5] + ei_class, ei_data = struct.unpack_from("BB", self.file_header, 4) logger.debug("ei_class: 0x%02x ei_data: 0x%02x", ei_class, ei_data) if ei_class == 1: self.bitness = 32 @@ -159,28 +160,24 @@ def _parse(self): raise CorruptElfFile(f"invalid ei_class: 0x{ei_class:02x}") if ei_data == 1: - self.endian = "little" + self.endian = "<" elif ei_data == 2: - self.endian = "big" + self.endian = ">" else: raise CorruptElfFile(f"not an ELF file: invalid ei_data: 0x{ei_data:02x}") if self.bitness == 32: - e_phoff = int.from_bytes(self.file_header[0x1C:0x20], byteorder=self.endian, signed=False) - e_shoff = int.from_bytes(self.file_header[0x20:0x24], byteorder=self.endian, signed=False) - self.e_phentsize = int.from_bytes(self.file_header[0x2A:0x2C], byteorder=self.endian, signed=False) - self.e_phnum = int.from_bytes(self.file_header[0x2C:0x2E], byteorder=self.endian, signed=False) - self.e_shentsize = int.from_bytes(self.file_header[0x2E:0x30], byteorder=self.endian, signed=False) - self.e_shnum = int.from_bytes(self.file_header[0x30:0x32], byteorder=self.endian, signed=False) - self.e_shstrndx = int.from_bytes(self.file_header[0x32:0x34], byteorder=self.endian, signed=False) + e_phoff, e_shoff = struct.unpack_from(self.endian + "II", self.file_header, 0x1C) + self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x2A) + self.e_shentsize, self.e_shnum, self.e_shstrndx = struct.unpack_from( + self.endian + "HHH", self.file_header, 0x2E + ) elif self.bitness == 64: - e_phoff = int.from_bytes(self.file_header[0x20:0x28], byteorder=self.endian, signed=False) - e_shoff = int.from_bytes(self.file_header[0x28:0x30], byteorder=self.endian, signed=False) - self.e_phentsize = int.from_bytes(self.file_header[0x36:0x38], byteorder=self.endian, signed=False) - self.e_phnum = int.from_bytes(self.file_header[0x38:0x3A], byteorder=self.endian, signed=False) - self.e_shentsize = int.from_bytes(self.file_header[0x3A:0x3C], byteorder=self.endian, signed=False) - self.e_shnum = int.from_bytes(self.file_header[0x3C:0x3E], byteorder=self.endian, signed=False) - self.e_shstrndx = int.from_bytes(self.file_header[0x3E:0x40], byteorder=self.endian, signed=False) + e_phoff, e_shoff = struct.unpack_from(self.endian + "QQ", self.file_header, 0x20) + self.e_phentsize, self.e_phnum = struct.unpack_from(self.endian + "HH", self.file_header, 0x36) + self.e_shentsize, self.e_shnum, self.e_shstrndx = struct.unpack_from( + self.endian + "HHH", self.file_header, 0x3A + ) else: raise NotImplementedError() @@ -230,7 +227,7 @@ def _parse(self): @property def ei_osabi(self) -> Optional[OS]: - ei_osabi = int.from_bytes(self.file_header[7:8], byteorder=self.endian, signed=False) + (ei_osabi,) = struct.unpack_from(self.endian + "B", self.file_header, 7) return ELF.OSABI.get(ei_osabi) MACHINE = { @@ -327,7 +324,7 @@ def ei_osabi(self) -> Optional[OS]: @property def e_machine(self) -> Optional[str]: - (e_machine,) = (int.from_bytes(self.file_header[0x12:0x14], byteorder=self.endian, signed=False),) + (e_machine,) = struct.unpack_from(self.endian + "H", self.file_header, 0x12) return ELF.MACHINE.get(e_machine) def parse_program_header(self, i) -> Phdr: @@ -335,21 +332,13 @@ def parse_program_header(self, i) -> Phdr: phent = self.phbuf[phent_offset : phent_offset + self.e_phentsize] if self.bitness == 32: - p_type = int.from_bytes(phent[0:4], byteorder=self.endian, signed=False) - p_offset = int.from_bytes(phent[4:8], byteorder=self.endian, signed=False) - p_vaddr = int.from_bytes(phent[8:12], byteorder=self.endian, signed=False) - p_paddr = int.from_bytes(phent[12:16], byteorder=self.endian, signed=False) - p_filesz = int.from_bytes(phent[16:20], byteorder=self.endian, signed=False) - p_memsz = int.from_bytes(phent[20:24], byteorder=self.endian, signed=False) - p_flags = int.from_bytes(phent[24:28], byteorder=self.endian, signed=False) + p_type, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz, p_flags = struct.unpack_from( + self.endian + "IIIIIII", phent, 0x0 + ) elif self.bitness == 64: - p_type = int.from_bytes(phent[0:4], byteorder=self.endian, signed=False) - p_flags = int.from_bytes(phent[4:8], byteorder=self.endian, signed=False) - p_offset = int.from_bytes(phent[8:16], byteorder=self.endian, signed=False) - p_vaddr = int.from_bytes(phent[16:24], byteorder=self.endian, signed=False) - p_paddr = int.from_bytes(phent[24:32], byteorder=self.endian, signed=False) - p_filesz = int.from_bytes(phent[32:40], byteorder=self.endian, signed=False) - p_memsz = int.from_bytes(phent[40:48], byteorder=self.endian, signed=False) + p_type, p_flags, p_offset, p_vaddr, p_paddr, p_filesz, p_memsz = struct.unpack_from( + self.endian + "IIQQQQQ", phent, 0x0 + ) else: raise NotImplementedError() @@ -373,23 +362,13 @@ def parse_section_header(self, i) -> Shdr: shent = self.shbuf[shent_offset : shent_offset + self.e_shentsize] if self.bitness == 32: - sh_name = int.from_bytes(shent[0:4], byteorder=self.endian, signed=False) - sh_type = int.from_bytes(shent[4:8], byteorder=self.endian, signed=False) - sh_flags = int.from_bytes(shent[8:12], byteorder=self.endian, signed=False) - sh_addr = int.from_bytes(shent[12:16], byteorder=self.endian, signed=False) - sh_offset = int.from_bytes(shent[16:20], byteorder=self.endian, signed=False) - sh_size = int.from_bytes(shent[20:24], byteorder=self.endian, signed=False) - sh_link = int.from_bytes(shent[24:28], byteorder=self.endian, signed=False) - sh_entsize = int.from_bytes(shent[36:40], byteorder=self.endian, signed=False) + sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, _, _, sh_entsize = struct.unpack_from( + self.endian + "IIIIIIIIII", shent, 0x0 + ) elif self.bitness == 64: - sh_name = int.from_bytes(shent[0:4], byteorder=self.endian, signed=False) - sh_type = int.from_bytes(shent[4:8], byteorder=self.endian, signed=False) - sh_flags = int.from_bytes(shent[8:16], byteorder=self.endian, signed=False) - sh_addr = int.from_bytes(shent[16:24], byteorder=self.endian, signed=False) - sh_offset = int.from_bytes(shent[24:32], byteorder=self.endian, signed=False) - sh_size = int.from_bytes(shent[32:40], byteorder=self.endian, signed=False) - sh_link = int.from_bytes(shent[40:44], byteorder=self.endian, signed=False) - sh_entsize = int.from_bytes(shent[56:64], byteorder=self.endian, signed=False) + sh_name, sh_type, sh_flags, sh_addr, sh_offset, sh_size, sh_link, _, _, sh_entsize = struct.unpack_from( + self.endian + "IIQQQQIIQQ", shent, 0x0 + ) else: raise NotImplementedError() @@ -447,11 +426,9 @@ def versions_needed(self) -> dict[str, set[str]]: vn_offset = 0x0 while True: # ElfXX_Verneed layout is the same on 32 and 64 bit - vn_version = int.from_bytes(shdr.buf[vn_offset : vn_offset + 2], byteorder=self.endian, signed=False) - vn_cnt = int.from_bytes(shdr.buf[vn_offset + 2 : vn_offset + 4], byteorder=self.endian, signed=False) - vn_file = int.from_bytes(shdr.buf[vn_offset + 4 : vn_offset + 8], byteorder=self.endian, signed=False) - vn_aux = int.from_bytes(shdr.buf[vn_offset + 8 : vn_offset + 12], byteorder=self.endian, signed=False) - vn_next = int.from_bytes(shdr.buf[vn_offset + 12 : vn_offset + 16], byteorder=self.endian, signed=False) + vn_version, vn_cnt, vn_file, vn_aux, vn_next = struct.unpack_from( + self.endian + "HHIII", shdr.buf, vn_offset + ) if vn_version != 1: # unexpected format, don't try to keep parsing break @@ -465,12 +442,7 @@ def versions_needed(self) -> dict[str, set[str]]: vna_offset = vn_offset + vn_aux for _ in range(vn_cnt): # ElfXX_Vernaux layout is the same on 32 and 64 bit - vna_name = int.from_bytes( - shdr.buf[vna_offset + 8 : vna_offset + 12], byteorder=self.endian, signed=False - ) - vna_next = int.from_bytes( - shdr.buf[vna_offset + 12 : vna_offset + 16], byteorder=self.endian, signed=False - ) + _, _, _, vna_name, vna_next = struct.unpack_from(self.endian + "IHHII", shdr.buf, vna_offset) # ABI names, like: "GLIBC_2.2.5" abi = read_cstr(linked_shdr.buf, vna_name) @@ -501,12 +473,10 @@ def dynamic_entries(self) -> Iterator[tuple[int, int]]: offset = 0x0 while True: if self.bitness == 32: - d_tag = int.from_bytes(phdr.buf[offset : offset + 4], byteorder=self.endian, signed=False) - d_val = int.from_bytes(phdr.buf[offset + 4 : offset + 8], byteorder=self.endian, signed=False) + d_tag, d_val = struct.unpack_from(self.endian + "II", phdr.buf, offset) offset += 8 elif self.bitness == 64: - d_tag = int.from_bytes(phdr.buf[offset : offset + 8], byteorder=self.endian, signed=False) - d_val = int.from_bytes(phdr.buf[offset + 8 : offset + 16], byteorder=self.endian, signed=False) + d_tag, d_val = struct.unpack_from(self.endian + "QQ", phdr.buf, offset) offset += 16 else: raise NotImplementedError() @@ -610,7 +580,7 @@ class ABITag: class PHNote: - def __init__(self, endian: Literal["big", "little"], buf: bytes): + def __init__(self, endian: str, buf: bytes): self.endian = endian self.buf = buf @@ -622,9 +592,7 @@ def __init__(self, endian: Literal["big", "little"], buf: bytes): self._parse() def _parse(self): - namesz = int.from_bytes(self.buf[0x0:0x4], byteorder=self.endian, signed=False) - self.descsz = int.from_bytes(self.buf[0x4:0x8], byteorder=self.endian, signed=False) - self.type_ = int.from_bytes(self.buf[0x8:0xC], byteorder=self.endian, signed=False) + namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) name_offset = 0xC self.desc_offset = name_offset + align(namesz, 0x4) @@ -648,10 +616,7 @@ def abi_tag(self) -> Optional[ABITag]: return None desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] - abi_tag = int.from_bytes(desc[0:4], byteorder=self.endian, signed=False) - kmajor = int.from_bytes(desc[4:8], byteorder=self.endian, signed=False) - kminor = int.from_bytes(desc[8:12], byteorder=self.endian, signed=False) - kpatch = int.from_bytes(desc[12:16], byteorder=self.endian, signed=False) + abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) os = GNU_ABI_TAG.get(abi_tag) @@ -664,7 +629,7 @@ def abi_tag(self) -> Optional[ABITag]: class SHNote: - def __init__(self, endian: Literal["big", "little"], buf: bytes): + def __init__(self, endian: str, buf: bytes): self.endian = endian self.buf = buf @@ -676,9 +641,7 @@ def __init__(self, endian: Literal["big", "little"], buf: bytes): self._parse() def _parse(self): - namesz = int.from_bytes(self.buf[0x0:0x4], byteorder=self.endian, signed=False) - self.descsz = int.from_bytes(self.buf[0x4:0x8], byteorder=self.endian, signed=False) - self.type_ = int.from_bytes(self.buf[0x8:0xC], byteorder=self.endian, signed=False) + namesz, self.descsz, self.type_ = struct.unpack_from(self.endian + "III", self.buf, 0x0) name_offset = 0xC self.desc_offset = name_offset + align(namesz, 0x4) @@ -697,10 +660,7 @@ def abi_tag(self) -> Optional[ABITag]: return None desc = self.buf[self.desc_offset : self.desc_offset + self.descsz] - abi_tag = int.from_bytes(desc[0:4], byteorder=self.endian, signed=False) - kmajor = int.from_bytes(desc[4:8], byteorder=self.endian, signed=False) - kminor = int.from_bytes(desc[8:12], byteorder=self.endian, signed=False) - kpatch = int.from_bytes(desc[12:16], byteorder=self.endian, signed=False) + abi_tag, kmajor, kminor, kpatch = struct.unpack_from(self.endian + "IIII", desc, 0x0) logger.debug("GNU_ABI_TAG: 0x%02x", abi_tag) os = GNU_ABI_TAG.get(abi_tag) @@ -724,7 +684,7 @@ class Symbol: class SymTab: def __init__( self, - endian: Literal["big", "little"], + endian: str, bitness: int, symtab: Shdr, strtab: Shdr, @@ -736,7 +696,7 @@ def __init__( self._parse(endian, bitness, symtab.buf) - def _parse(self, endian: Literal["big", "little"], bitness: int, symtab_buf: bytes) -> None: + def _parse(self, endian: str, bitness: int, symtab_buf: bytes) -> None: """ return the symbol's information in the order specified by sys/elf32.h @@ -746,62 +706,12 @@ def _parse(self, endian: Literal["big", "little"], bitness: int, symtab_buf: byt for i in range(int(len(self.symtab.buf) / self.symtab.entsize)): if bitness == 32: - name_offset = int.from_bytes( - symtab_buf[i * self.symtab.entsize : i * self.symtab.entsize + 4], byteorder=endian, signed=False - ) - value = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 4 : i * self.symtab.entsize + 8], - byteorder=endian, - signed=False, - ) - size = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 8 : i * self.symtab.entsize + 12], - byteorder=endian, - signed=False, - ) - info = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 12 : i * self.symtab.entsize + 13], - byteorder=endian, - signed=False, - ) - other = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 13 : i * self.symtab.entsize + 14], - byteorder=endian, - signed=False, - ) - shndx = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 14 : i * self.symtab.entsize + 16], - byteorder=endian, - signed=False, + name_offset, value, size, info, other, shndx = struct.unpack_from( + endian + "IIIBBH", symtab_buf, i * self.symtab.entsize ) elif bitness == 64: - name_offset = int.from_bytes( - symtab_buf[i * self.symtab.entsize : i * self.symtab.entsize + 4], byteorder=endian, signed=False - ) - info = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 4 : i * self.symtab.entsize + 5], - byteorder=endian, - signed=False, - ) - other = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 5 : i * self.symtab.entsize + 6], - byteorder=endian, - signed=False, - ) - shndx = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 6 : i * self.symtab.entsize + 8], - byteorder=endian, - signed=False, - ) - value = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 8 : i * self.symtab.entsize + 16], - byteorder=endian, - signed=False, - ) - size = int.from_bytes( - symtab_buf[i * self.symtab.entsize + 16 : i * self.symtab.entsize + 24], - byteorder=endian, - signed=False, + name_offset, info, other, shndx, value, size = struct.unpack_from( + endian + "IBBHQQ", symtab_buf, i * self.symtab.entsize ) self.symbols.append(Symbol(name_offset, value, size, info, other, shndx)) @@ -829,7 +739,7 @@ def get_symbols(self) -> Iterator[Symbol]: @classmethod def from_viv(cls, elf: "Elf.Elf") -> Optional["SymTab"]: - endian: Literal["big", "little"] = "little" if elf.getEndian() == 0 else "big" + endian = "<" if elf.getEndian() == 0 else ">" bitness = elf.bits SHT_SYMTAB = 0x2 @@ -1124,13 +1034,12 @@ def read_data(elf: ELF, rva: int, size: int) -> Optional[bytes]: def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]: - psize: int = 0 if elf.bitness == 32: struct_size = 8 - psize = 4 + struct_format = elf.endian + "II" elif elf.bitness == 64: struct_size = 16 - psize = 8 + struct_format = elf.endian + "QQ" else: raise ValueError("invalid psize") @@ -1138,8 +1047,7 @@ def read_go_slice(elf: ELF, rva: int) -> Optional[bytes]: if not struct_buf: return None - addr = int.from_bytes(struct_buf[0:psize], byteorder=elf.endian, signed=False) - length = int.from_bytes(struct_buf[psize : psize * 2], byteorder=elf.endian, signed=False) + addr, length = struct.unpack_from(struct_format, struct_buf, 0) return read_data(elf, addr, length) @@ -1188,12 +1096,7 @@ def guess_os_from_go_buildinfo(elf: ELF) -> Optional[OS]: logger.debug("go buildinfo: no buildinfo magic") return None - psize = int.from_bytes( - buf[index + len(BUILDINFO_MAGIC) : index + len(BUILDINFO_MAGIC) + 1], byteorder="little", signed=True - ) - flags = int.from_bytes( - buf[index + len(BUILDINFO_MAGIC) + 1 : index + len(BUILDINFO_MAGIC) + 2], byteorder="little", signed=True - ) + psize, flags = struct.unpack_from(" Optional[OS]: # This is the uncommon path. Most samples will have an inline GOOS string. # # To find samples on VT, use the referenced VTGrep content searches. - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 00} - # like: 71e617e5cc7fda89bf67422ff60f437e9d54622382c5ed6ff31f75e601f9b22e - # in which the modinfo doesn't have GOOS. - # 4 byte size and little endian - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 00} - # like: 93d3b3e2a904c6c909e20f2f76c3c2e8d0c81d535eb46e5493b5701f461816c3 - # in which the modinfo doesn't have GOOS. - # 8 byte size and little endian - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 01} - # (no matches on VT today) - # 4 byte size and little endian - # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01} - # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a - # in which the modinfo doesn't have GOOS. - # 8 byte size and big endian - - endian: Literal["big", "little"] = "big" if is_big_endian else "little" - if psize == 4: - build_version_address = int.from_bytes(buf[index + 0x10 : index + 0x14], byteorder=endian, signed=False) - modinfo_address = int.from_bytes(buf[index + 0x14 : index + 0x18], byteorder=endian, signed=False) - else: # psize == 8 - build_version_address = int.from_bytes(buf[index + 0x10 : index + 0x18], byteorder=endian, signed=False) - modinfo_address = int.from_bytes(buf[index + 0x18 : index + 0x20], byteorder=endian, signed=False) + info_format = { + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 04 00} + # like: 71e617e5cc7fda89bf67422ff60f437e9d54622382c5ed6ff31f75e601f9b22e + # in which the modinfo doesn't have GOOS. + (4, False): "II", + # content: {ff 20 47 6f 20 62 75 69 6c 64 69 6e 66 3a 08 01} + # like: d44ba497964050c0e3dd2a192c511e4c3c4f17717f0322a554d64b797ee4690a + # in which the modinfo doesn't have GOOS. + (8, True): ">QQ", + } + + build_version_address, modinfo_address = struct.unpack_from( + info_format[(psize, is_big_endian)], buf, index + 0x10 + ) logger.debug("go buildinfo: build version address: 0x%x", build_version_address) logger.debug("go buildinfo: modinfo address: 0x%x", modinfo_address) diff --git a/capa/features/extractors/ghidra/basicblock.py b/capa/features/extractors/ghidra/basicblock.py index 904c20e2e..25b73ee43 100644 --- a/capa/features/extractors/ghidra/basicblock.py +++ b/capa/features/extractors/ghidra/basicblock.py @@ -14,6 +14,7 @@ import string +import struct from typing import Iterator import ghidra @@ -34,13 +35,13 @@ def get_printable_len(op: ghidra.program.model.scalar.Scalar) -> int: op_val = op.getValue() if op_bit_len == 8: - chars = (op_val & 0xFF).to_bytes(1, "little") + chars = struct.pack(" MAX_OFFSET_PE_AFTER_MZ: diff --git a/capa/features/extractors/helpers.py b/capa/features/extractors/helpers.py index f764a64ad..eb546f504 100644 --- a/capa/features/extractors/helpers.py +++ b/capa/features/extractors/helpers.py @@ -13,6 +13,7 @@ # limitations under the License. +import struct import builtins from typing import Iterator @@ -156,7 +157,7 @@ def carve_pe(pbytes: bytes, offset: int = 0) -> Iterator[tuple[int, int]]: if pblen < (e_lfanew + 4): continue - newoff = int.from_bytes(xor_static(pbytes[e_lfanew : e_lfanew + 4], key), "little") + newoff = struct.unpack(" int: op_val = capa.features.extractors.ida.helpers.mask_op_val(op) if op.dtype == idaapi.dt_byte: - chars = (op_val).to_bytes(1, "little") + chars = struct.pack(" Iterator[tuple[int, int]]: if seg_max < (e_lfanew + 4): continue - newoff = int.from_bytes(capa.features.extractors.helpers.xor_static(idc.get_bytes(e_lfanew, 4), i), "little") + newoff = struct.unpack(" MAX_OFFSET_PE_AFTER_MZ: diff --git a/capa/features/extractors/viv/basicblock.py b/capa/features/extractors/viv/basicblock.py index 65c1f7d0d..0f95bdef1 100644 --- a/capa/features/extractors/viv/basicblock.py +++ b/capa/features/extractors/viv/basicblock.py @@ -14,6 +14,7 @@ import string +import struct from typing import Iterator import envi @@ -118,13 +119,13 @@ def get_printable_len(oper: envi.archs.i386.disasm.i386ImmOper) -> int: Return string length if all operand bytes are ascii or utf16-le printable """ if oper.tsize == 1: - chars = (oper.imm).to_bytes(1, "little") + chars = struct.pack(" Date: Fri, 31 Jan 2025 14:44:30 +0530 Subject: [PATCH 5/5] update CHANGELOG Signed-off-by: vibhatsu --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9113f6dac..1f09f8f08 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -28,6 +28,7 @@ - strings: add type hints and fix uncovered bugs @williballenthin #2555 - elffile: handle symbols without a name @williballenthin #2553 - project: remove pytest-cov that wasn't used @williballenthin @2491 +- replace binascii methods with native Python methods @v1bh475u #2582 ### capa Explorer Web