From 1954bc9166865096348e9c5c53e019c0dc8d55bb Mon Sep 17 00:00:00 2001
From: Emiliano Testa
Date: Fri, 5 Apr 2024 09:46:51 +0100
Subject: [PATCH 1/2] Extended event log, support for file operations (open, close, read, write)

An extended event log helps comparing recordings of different runs of the
same process. For this reason the log is output in json format and the
comparison can be done by another script that understands the format and
knows what to look for.

There is a long list of TODOs at the top of the python script before this
can be considered complete.
---
 extended_evt_log/README.md           |  22 ++
 extended_evt_log/extended_evt_log.py | 389 +++++++++++++++++++++++++++
 extended_evt_log/schema.json         |  72 ++++++
 3 files changed, 483 insertions(+)
 create mode 100644 extended_evt_log/README.md
 create mode 100644 extended_evt_log/extended_evt_log.py
 create mode 100644 extended_evt_log/schema.json

diff --git a/extended_evt_log/README.md b/extended_evt_log/README.md
new file mode 100644
index 0000000..ac22300
--- /dev/null
+++ b/extended_evt_log/README.md
@@ -0,0 +1,22 @@
+# Extended event log
+
+Capture all parameters and results of a number of syscalls.
+The objective is to make it easier to compare different runs of the same process.
+
+## Usage
+
+```
+extended-evt-log [-output OUTPUT-PATH]
+```
+
+Before using the script it must be loaded into the debugger:
+```
+source PATHTOADDONS/extended_evt_log/extended_evt_log.py
+```
+
+### Optional arguments
+
+- `-output OUTPUT-PATH`, `-o OUTPUT-PATH`:
+  Path to a file where to write the JSON event log. If not specified, the
+  content is printed on standard output.
+
diff --git a/extended_evt_log/extended_evt_log.py b/extended_evt_log/extended_evt_log.py
new file mode 100644
index 0000000..a1e62b7
--- /dev/null
+++ b/extended_evt_log/extended_evt_log.py
@@ -0,0 +1,389 @@
+"""
+Reconstruct the content of a socket communication by going through all events
+that used the file descriptor associated to the socket.
+
+To use, load this file in UDB (see the `source` command).
+
+See `help rebuild-comms` for usage information.
+
+Contributors: Emiliano Testa
+"""
+
+import argparse
+import json
+from os import remove
+#import re
+import sys
+import textwrap
+#from enum import Enum
+from pathlib import Path
+from typing import Iterator, NoReturn, Optional
+
+import gdb
+import pdb
+from undodb.debugger_extensions import debugger_utils, udb#,debugger_io
+
+
+def iterate_events(condition: str) -> Iterator[None]:
+    """
+    Stops at all events matching `condition`.
+
+    See `ugo event next`.
+    """
+    while True:
+        event_next_output = debugger_utils.execute_to_string(f"ugo event next {condition}")
+        if "No matching event" in event_next_output:
+            break
+        yield
+
+
+def get_syscall_argument(index: int) -> gdb.Value:
+    """
+    Returns a :class:`gdb.Value` representing an argument to a syscall.
+
+    If `index` is 0 the first argument is returned, and so on.
+
+    Execution must be stopped at a `syscall` instruction so all registers are set up for the
+    syscall.
+    """
+    syscall_args = ["rdi", "rsi", "rdx", "r10", "r8", "r9"]
+    reg_name = syscall_args[index]
+    return gdb.selected_frame().read_register(reg_name)
+
+
+def get_syscall_name() -> str:
+    """
+    Returns the name of the syscall being executed at the current time.
+
+    Only the few syscalls required by this file as currently supported.
+
+    Execution must be stopped at a `syscall` instruction so all registers are set up for the
+    syscall.
+    """
+    syscall_number = int(gdb.selected_frame().read_register("eax"))
+    try:
+        return {
+            0: "read",
+            1: "write",
+            2: "open",
+            3: "close",
+            17: "pread64",
+            18: "pwrite64",
+            41: "socket",
+            43: "accept",
+            44: "sendto",
+            45: "recvfrom",
+            46: "sendmsg",
+            47: "recvmsg",
+            49: "bind",
+            50: "listen",
+            257: "openat",
+            299: "recvmmsg",
+            307: "sendmmsg",
+        }[syscall_number]
+    except KeyError as exc:
+        raise gdb.GdbError(f"Encountered unknown syscall {syscall_number}.") from exc
+
+def read_memory(address: gdb.Value, size: int) -> str:
+    inferior = gdb.selected_inferior()
+    mem = inferior.read_memory(address, size).tobytes()
+    return ''.join('{:02x}'.format(b) for b in mem)
+
+def get_syscall_result() -> gdb.Value:
+    """
+    Returns the result of a syscall being executed at the current time.
+
+    To do so, execution is moved to just after the syscall returns.
+
+    Execution must be stopped at a `syscall` instruction.
+    """
+    debugger_utils.execute_to_string("nexti")
+    return gdb.selected_frame().read_register("eax")
+
+class RebuildSocketComms(gdb.Command):
+    """
+    Command which rebuilds all socket communications performed by a debugged program from execution
+    history.
+
+    See `help rebuid-comms` for details on usage.
+    """
+
+    def __init__(self) -> None:
+        name = "extended-ev-log"
+
+        # Force the width to fit of help messages to fit in 80 columns to match GDB's behaviour.
+        class HelpFormatter(argparse.HelpFormatter):
+            def __init__(self, prog, indent_increment=2, max_help_position=24, width=80):
+                super().__init__(prog, indent_increment, max_help_position, width)
+
+        super().__init__(name, gdb.COMMAND_USER)
+        self.parser = argparse.ArgumentParser(
+            prog=name,
+            # Do not add support for -h / --help. The user can use the help command.
+            add_help=False,
+            formatter_class=HelpFormatter,
+            description="""
+                Reconstruct all socket communications performed by the target program by examining
+                its execution history.
+                """,
+        )
+
+        # argparse.ArgumentParser.exit calls sys.exit which, inside GDB, causes the inferior to
+        # be detached.
+        # We redefine the function to behave the same, except that a SystemExit is raised instead.
+        # This won't be needed in Python 3.9 where the exit_on_error=False parameter can be set
+        # when initialising the parser.
+        def fake_exit(status: int = 0, message: Optional[str] = None) -> NoReturn:
+            if message is not None:
+                print(message, file=sys.stderr)
+            raise SystemExit(status)
+
+        self.parser.exit = fake_exit  # type: ignore
+
+        self.setup_evt_dict()
+        self.stream_cnt = 0
+        self.stream_storage = {}
+        self.char_p = gdb.lookup_type("char").pointer()
+        selection_group = self.parser.add_mutually_exclusive_group()
+        selection_group.add_argument(
+            "-regex",
+            dest="path_pattern",
+            metavar="PATH-REGEX",
+            default=".*",
+            help="""
+                A regular expression matching the path of the file to reconstruct.
+                Only the first file matching the regular expression is considered.
+                """,
+        )
+        selection_group.add_argument(
+            "-fd",
+            metavar="FILE-DESCRIPTOR",
+            type=int,
+            help="""
+                The file descriptor of the file to reconstruct.
+                """,
+        )
+        self.parser.add_argument(
+            "-from-start",
+            action="store_true",
+            help="""
+                By default, the file is reconstructed starting at the current time in execution
+                history.
+                With this flag, the execution history is considered from its beginning.
+                """,
+        )
+        self.parser.add_argument(
+            "-output",
+            "-o",
+            metavar="OUTPUT-PATH",
+            help="""
+                Path to a file were to write the reconstructed file.
+                If not specified, the content is printed on standard output.
+                """,
+        )
+
+        self.__doc__ = (
+            self.parser.format_help()
+            + textwrap.dedent(
+                """\
+
+                Limitations:
+                - Only 64-bit x86 is supported.
+                - Only files which are read in their entirety can be fully reconstructed.
+                - Seeks in files being read are ignored. If the target program uses fseek or
+                  similar, then the file won't be reconstructed correctly.
+                - Regular expressions matching the whole path (including directories) may
+                  not match opened files correctly due to path manipulation in the target
+                  program.
+                - Signals may cause the command to fail in unexpected ways.
+                """
+            ).rstrip()
+        )
+
+    def get_stream_for_fd(self, fd: int, remove_from_count: bool = False) -> str:
+        """
+        Returns the name of the stream tracking `fd`, creating a new `stream_N` entry if the
+        file descriptor is not currently tracked.
+
+        With `remove_from_count`, an existing mapping is dropped (the name is still returned)
+        so that a later reuse of the same descriptor number gets a fresh stream.
+        """
+        for s_name, s_fd in self.stream_storage.items():
+            if fd == s_fd:
+                if remove_from_count:
+                    # Deleting during iteration is fine here: we return immediately.
+                    del self.stream_storage[s_name]
+                return s_name
+        s_name = f'stream_{self.stream_cnt}'
+        self.stream_cnt += 1
+        self.stream_storage[s_name] = fd
+        return s_name
+
+    def setup_evt_dict(self):
+        """Initialises the top-level dictionary that is dumped as JSON by `invoke`."""
+        self.evt_dict = {}
+        self.evt_dict["title"] = "Extended event log"
+        self.evt_dict["description"] = "Allows easier parsing of events from recordings"
+        self.evt_dict["streams"] = {}
+
+    def handle_write(self, write_type: str) -> None:
+        """
+        Records a `write` or `pwrite64` syscall in the stream of its file descriptor.
+
+        Execution must be stopped at the `syscall` instruction; it is left just after it.
+        """
+        # Read the arguments while still stopped at the "syscall" instruction.
+        fd = int(get_syscall_argument(0))
+        data_address = get_syscall_argument(1).cast(self.char_p)
+        data_len = int(get_syscall_argument(2))
+        write_op = {}
+        write_op["type"] = write_type
+        if 'pwrite' in write_type:
+            write_op["offset"] = int(get_syscall_argument(3))
+        current_time = udb.time.get().bbcount
+        result = int(get_syscall_result())
+        if result > 0:
+            write_op["data"] = read_memory(data_address, result)
+        else:
+            write_op["data"] = 0
+        write_op["data_size"] = data_len
+        write_op["result"] = result
+        write_op["bbcount"] = current_time
+        s_name = self.get_stream_for_fd(fd)
+        try:
+            file_stream = self.evt_dict["streams"][s_name]
+        except KeyError:
+            file_stream = {
+                "file_name": "",
+                "writes": []
+            }
+        try:
+            file_stream["writes"].append(write_op)
+        except KeyError:
+            file_stream["writes"] = [write_op]
+        self.evt_dict["streams"][s_name] = file_stream
+
+    def handle_read(self, read_type: str) -> None:
+        """
+        Records a `read` or `pread64` syscall in the stream of its file descriptor.
+
+        Execution must be stopped at the `syscall` instruction; it is left just after it.
+        """
+        # Read the arguments while still stopped at the "syscall" instruction.
+        fd = int(get_syscall_argument(0))
+        data_address = get_syscall_argument(1).cast(self.char_p)
+        data_len = int(get_syscall_argument(2))
+        read_op = {}
+        if 'pread' in read_type:
+            read_op["offset"] = int(get_syscall_argument(3))
+        current_time = udb.time.get().bbcount
+        result = int(get_syscall_result())
+        read_op["type"] = read_type
+        if result > 0:
+            read_op["data"] = read_memory(data_address, result)
+        else:
+            read_op["data"] = 0
+        read_op["data_size"] = data_len
+        read_op["result"] = result
+        read_op["bbcount"] = current_time
+        s_name = self.get_stream_for_fd(fd)
+        try:
+            file_stream = self.evt_dict["streams"][s_name]
+        except KeyError:
+            file_stream = {
+                "file_name": "",
+                "reads": []
+            }
+        try:
+            file_stream["reads"].append(read_op)
+        except KeyError:
+            file_stream["reads"] = [read_op]
+        self.evt_dict["streams"][s_name] = file_stream
+
+    def handle_close(self, syscall_name: str) -> None:
+        """
+        Records a `close` syscall; if the descriptor was tracked, its mapping is dropped.
+
+        Execution must be stopped at the `syscall` instruction; it is left just after it.
+        """
+        fd = int(get_syscall_argument(0))
+        current_time = udb.time.get().bbcount
+        s_name = self.get_stream_for_fd(fd, remove_from_count=True)
+        result = int(get_syscall_result())
+        try:
+            file_stream = self.evt_dict["streams"][s_name]
+            file_stream["close"] = {
+                "fd": fd,
+                "result": result,
+                "bbcount": current_time
+            }
+        except KeyError:
+            file_stream = {
+                "file_name": "",
+                "close": {
+                    "fd": fd,
+                    "result": result,
+                    "bbcount": current_time
+                }
+            }
+        self.evt_dict["streams"][s_name] = file_stream
+
+    def handle_open(self, syscall_name: str) -> None:
+        """
+        Records an `open` or `openat` syscall as the start of a stream.
+
+        The path is argument 0 for `open` and argument 1 for `openat`. Execution must be
+        stopped at the `syscall` instruction; it is left just after it.
+        """
+        fname_idx = 0
+        if syscall_name == 'openat':
+            fname_idx = 1
+        fname = get_syscall_argument(fname_idx).cast(self.char_p).string()
+        current_time = udb.time.get().bbcount
+        fd = int(get_syscall_result())
+        file_stream = {
+            "file_name": fname,
+            syscall_name: {
+                "result": fd,
+                "bbcount": current_time
+            }
+        }
+        s_name = self.get_stream_for_fd(fd)
+        self.evt_dict["streams"][s_name] = file_stream
+
+    def invoke(self, args: str, from_tty: bool) -> None:
+        """
+        Walks the recorded history from its start and dumps the collected events as JSON.
+
+        The output goes to the `-output` file if given, otherwise to standard output.
+        """
+        try:
+            opts = self.parser.parse_args(gdb.string_to_argv(args))
+        except SystemExit:
+            # TODO: once we depend on Python 3.9, use exit_on_error=False when initialising the
+            # parser rather than catching this exception.
+            return
+
+        with debugger_utils.breakpoints_suspended(), udb.time.auto_reverting():
+            udb.time.goto_start()
+            for _ in iterate_events("name in ('write', 'pwrite64', 'open', 'openat', 'close', 'read', 'pread64')"):
+                syscall_name = get_syscall_name()
+                if 'write' in syscall_name:
+                    self.handle_write(syscall_name)
+                elif 'open' in syscall_name:
+                    self.handle_open(syscall_name)
+                elif syscall_name == 'close':
+                    self.handle_close(syscall_name)
+                elif 'read' in syscall_name:
+                    self.handle_read(syscall_name)
+
+        if opts.output:
+            try:
+                with Path(opts.output).open('w') as json_file:
+                    json.dump(self.evt_dict, json_file, indent=4)
+            except IOError as exc:
+                raise gdb.GdbError(f"Cannot write output: {exc}") from exc
+        else:
+            print(json.dumps(self.evt_dict, indent=4))
+
+RebuildSocketComms()
diff --git a/extended_evt_log/schema.json b/extended_evt_log/schema.json
new file mode 100644
index 0000000..091220e
--- /dev/null
+++ b/extended_evt_log/schema.json
@@ -0,0 +1,72 @@
+{
+    "title": "Extended event log",
+    "description": "easier comparison of recordings",
+    "streams": {
+        "name": "string",
+        "fd": {
+            "open": {
+                "result": "integer",
+                "bbcount": "integer"
+            },
+            "close": {
+                "result": "integer",
+                "bbcount": "integer"
+            },
+            "reads": [
+                {
+                    "type": "string", <-- this defines the type of syscall: read, pread64, readv and so on
+                    "data": "bytes",
+                    "data_size": "integer",
+                    "result": "integer",
+                    "bbcount": "integer"
+                },
+                {
+                    "type": "string", <-- this defines the type of syscall: read, pread64, readv and so on
+                    "data": "bytes",
+                    "data_size": "integer",
+                    "result": "integer",
+                    "bbcount": "integer"
+                },
+            ],
+            "writes": [
+                {
+                    "type": "string", <-- this defines the type of syscall: write, pwrite64, writev and so on
+                    "data": "bytes",
+                    "data_size": "integer",
+                    "result": "integer",
+                    "bbcount": "integer"
+                },
+                {
+                    "type": "string", <-- this defines the type of syscall: write, pwrite64, writev and so on
+                    "data": "bytes",
+                    "data_size": "integer",
+                    "result": "integer",
+                    "bbcount": "integer"
+                },
+            ],
+        },
+        "socket1": {
+            "fd": "integer",
+            "socket": {},
+            "bind": {},
+            "accept": {},
+            "recv": [
+                {
+                    "params_1":
+                },
+                {
+                    "params_2":
+                },
+            ],
+            "send": [
+                {
+                    "params_1":
+                },
+                {
+                    "params_2":
+                },
+            ],
+        }
+    ]
+}
+

From 603fe16828d3a2cd5982553e250064dafd18b5f8 Mon Sep 17 00:00:00 2001
From: Emiliano Testa
Date: Fri, 5 Apr 2024 09:54:06 +0100
Subject: [PATCH 2/2] Extended evt log, added TODO and improved README

---
 extended_evt_log/extended_evt_log.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/extended_evt_log/extended_evt_log.py b/extended_evt_log/extended_evt_log.py
index a1e62b7..b3b7fcc 100644
--- a/extended_evt_log/extended_evt_log.py
+++ b/extended_evt_log/extended_evt_log.py
@@ -4,14 +4,20 @@
 
 To use, load this file in UDB (see the `source` command).
 
-See `help rebuild-comms` for usage information.
+See `help extended-evt-log` for usage information.
 
 Contributors: Emiliano Testa
+
+
+TODO:
+    1 - Add tests for what is present
+    2 - refactor the code to remove the boilerplate / have a central place for key names
+    3 - Add lots more syscalls (all the ones that use a fd)
+    4 - Think of a way to have a "stream" for syscalls WITHOUT a fd
 """
 
 import argparse
 import json
-from os import remove
 #import re
 import sys
 import textwrap
@@ -109,7 +115,7 @@ class RebuildSocketComms(gdb.Command):
     """
 
     def __init__(self) -> None:
-        name = "extended-ev-log"
+        name = "extended-evt-log"
 
         # Force the width to fit of help messages to fit in 80 columns to match GDB's behaviour.
         class HelpFormatter(argparse.HelpFormatter):