diff --git a/python/unblob/file_utils.py b/python/unblob/file_utils.py index dfdf1cb5ae..53868cb1a0 100644 --- a/python/unblob/file_utils.py +++ b/python/unblob/file_utils.py @@ -12,7 +12,7 @@ import unicodedata from collections.abc import Iterable, Iterator from pathlib import Path -from typing import Literal, Optional, Union +from typing import Literal, Optional, Protocol, Union from dissect.cstruct import cstruct from structlog import get_logger @@ -269,8 +269,15 @@ def iterate_patterns( file.seek(initial_position) +class RandomReader(Protocol): + # File implements this interface + + def read(self, n: Optional[int] = None) -> bytes: ... + def seek(self, pos: int, whence: int = io.SEEK_SET) -> int: ... + + def iterate_file( - file: File, + file: RandomReader, start_offset: int, size: int, # default buffer size in shutil for unix based systems @@ -297,7 +304,7 @@ def iterate_file( yield data -def carve(carve_path: Path, file: File, start_offset: int, size: int): +def carve(carve_path: Path, file: RandomReader, start_offset: int, size: int): """Extract part of a file.""" carve_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/python/unblob/handlers/archive/ar.py b/python/unblob/handlers/archive/ar.py index 3e90a6343f..5c54cf5fb9 100644 --- a/python/unblob/handlers/archive/ar.py +++ b/python/unblob/handlers/archive/ar.py @@ -1,12 +1,14 @@ +import io import os +from pathlib import Path from typing import Optional import arpy from structlog import get_logger -from ...extractors import Command -from ...file_utils import OffsetFile -from ...models import File, Handler, HexString, ValidChunk +from ...file_utils import FileSystem, OffsetFile, iterate_file +from ...models import Extractor, ExtractResult, File, Handler, HexString, ValidChunk +from ...report import ExtractionProblem logger = get_logger() @@ -15,6 +17,56 @@ SIGNATURE_LENGTH = 0x8 +class RandomReader: + """Adapter for file_utils.RandomReader. + + Changes the parameter names, as they are different for arpy and unblob.File. + """ + + def __init__(self, arpy_file: arpy.ArchiveFileData): + self._arpy_file = arpy_file + + def read(self, n: Optional[int] = None) -> bytes: + return self._arpy_file.read(n) + + def seek(self, pos: int, whence: int = io.SEEK_SET) -> int: + return self._arpy_file.seek(pos, whence) + + +class ArExtractor(Extractor): + def extract(self, inpath: Path, outdir: Path) -> Optional[ExtractResult]: + fs = FileSystem(outdir) + + with arpy.Archive(inpath.as_posix()) as archive: + archive.read_all_headers() + + for name in sorted(archive.archived_files): + archived_file = archive.archived_files[name] + + try: + path = Path(name.decode()) + except UnicodeDecodeError: + path = Path(name.decode(errors="replace")) + fs.record_problem( + ExtractionProblem( + path=repr(name), + problem="Path is not a valid UTF/8 string", + resolution=f"Converted to {path}", + ) + ) + + fs.write_chunks( + path, + chunks=iterate_file( + RandomReader(archived_file), + 0, + archived_file.header.size, + ), + ) + + return ExtractResult(reports=fs.problems) + + class ARHandler(Handler): NAME = "ar" @@ -27,7 +79,7 @@ class ARHandler(Handler): ) ] - EXTRACTOR = Command("unar", "-no-directory", "-o", "{outdir}", "{inpath}") + EXTRACTOR = ArExtractor() def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]: offset_file = OffsetFile(file, start_offset) diff --git a/tests/integration/archive/ar/__input__/bsd_mixed.ar b/tests/integration/archive/ar/__input__/bsd_mixed.ar new file mode 100644 index 0000000000..245da87f11 --- /dev/null +++ b/tests/integration/archive/ar/__input__/bsd_mixed.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1822aa7c0a030fa6ae9ea664b4ba2d804bb0bf5014cc7366d199374a1d64cc7c +size 358 diff --git a/tests/integration/archive/ar/__input__/bsd_multi_names.ar b/tests/integration/archive/ar/__input__/bsd_multi_names.ar new file mode 100644 index 0000000000..72127d3930 --- /dev/null +++ b/tests/integration/archive/ar/__input__/bsd_multi_names.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b57b0fc4cff4d4c8c37dbbc1f2f51ed49859f90a931da3465bb27b7ea75a412 +size 552 diff --git a/tests/integration/archive/ar/__input__/bsd_single_name.ar b/tests/integration/archive/ar/__input__/bsd_single_name.ar new file mode 100644 index 0000000000..cdff59a1e8 --- /dev/null +++ b/tests/integration/archive/ar/__input__/bsd_single_name.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d74d5cbf166638434c696563f88b13db1438d34f8e58675ae6bcf337edbcf9c +size 298 diff --git a/tests/integration/archive/ar/__input__/contents.ar b/tests/integration/archive/ar/__input__/contents.ar new file mode 100644 index 0000000000..e5dd87df8e --- /dev/null +++ b/tests/integration/archive/ar/__input__/contents.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34c790979dc722273410bc18d2d83712f763e111aa252f4f248d7785a5b2014c +size 160 diff --git a/tests/integration/archive/ar/__input__/empty.ar b/tests/integration/archive/ar/__input__/empty.ar new file mode 100644 index 0000000000..eab6e4c5b2 --- /dev/null +++ b/tests/integration/archive/ar/__input__/empty.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0a17a43c74d2fe5474fa2fd29c8f14799e777d7d75a2cc4d11c20a6e7b161c5 +size 8 diff --git a/tests/integration/archive/ar/__input__/gnu_mixed.ar b/tests/integration/archive/ar/__input__/gnu_mixed.ar new file mode 100644 index 0000000000..7c8406a8c8 --- /dev/null +++ b/tests/integration/archive/ar/__input__/gnu_mixed.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a43964b9c61030b12b2320f7782e25f2bab411e0529b85b7e92da969ca1d92a9 +size 274 diff --git a/tests/integration/archive/ar/__input__/gnu_multi_names.ar b/tests/integration/archive/ar/__input__/gnu_multi_names.ar new file mode 100644 index 0000000000..8586e35101 --- /dev/null +++ b/tests/integration/archive/ar/__input__/gnu_multi_names.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aee5062fe584e07862f99d75a4a28613cff6b261d49eab336d59273268fa6d8 +size 372 diff --git a/tests/integration/archive/ar/__input__/gnu_single_name.ar b/tests/integration/archive/ar/__input__/gnu_single_name.ar new file mode 100644 index 0000000000..da5efd56f9 --- /dev/null +++ b/tests/integration/archive/ar/__input__/gnu_single_name.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c733485cdc5c60af424e0a926b7c01752bb20ce0683bcb24433f38450d3c4684 +size 214 diff --git a/tests/integration/archive/ar/__input__/msvc_lib.ar b/tests/integration/archive/ar/__input__/msvc_lib.ar new file mode 100644 index 0000000000..1edf700e14 --- /dev/null +++ b/tests/integration/archive/ar/__input__/msvc_lib.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28e538947fa35b36f4e6c626b8eb32b4544bea99fcdd6ed26975cb47a031804 +size 372 diff --git a/tests/integration/archive/ar/__input__/normal.ar b/tests/integration/archive/ar/__input__/normal.ar new file mode 100644 index 0000000000..593c534196 --- /dev/null +++ b/tests/integration/archive/ar/__input__/normal.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c8382a4b47bb6dc0281179d49631cdc9c45df96676f01c61b81b32bfcbaccb +size 68 diff --git a/tests/integration/archive/ar/__input__/sym.ar b/tests/integration/archive/ar/__input__/sym.ar new file mode 100644 index 0000000000..7592847c25 --- /dev/null +++ b/tests/integration/archive/ar/__input__/sym.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:604b7db9192bf1b3c6dc6e2d7d923a789fa418f9bd56c688af0dd14e6546541b +size 132 diff --git a/tests/integration/archive/ar/__input__/windows.ar b/tests/integration/archive/ar/__input__/windows.ar new file mode 100644 index 0000000000..d8bb45bf79 --- /dev/null +++ b/tests/integration/archive/ar/__input__/windows.ar @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebacc15fe4c59566f2967457491170853df8be4275bd34fcd140f967b0c59807 +size 68 diff --git a/tests/integration/archive/ar/__output__/bsd_mixed.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/bsd_mixed.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/bsd_mixed.ar_extract/short b/tests/integration/archive/ar/__output__/bsd_mixed.ar_extract/short new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/bsd_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/bsd_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/bsd_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space b/tests/integration/archive/ar/__output__/bsd_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/bsd_single_name.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/bsd_single_name.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/contents.ar_extract/file1 b/tests/integration/archive/ar/__output__/contents.ar_extract/file1 new file mode 100644 index 0000000000..38f1bc1775 --- /dev/null +++ b/tests/integration/archive/ar/__output__/contents.ar_extract/file1 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:732a50d53e827bf15e8cbac02f8b56344cf0443973abfd584476d4d1d90020ba +size 15 diff --git a/tests/integration/archive/ar/__output__/contents.ar_extract/file2 b/tests/integration/archive/ar/__output__/contents.ar_extract/file2 new file mode 100644 index 0000000000..29002ba10b --- /dev/null +++ b/tests/integration/archive/ar/__output__/contents.ar_extract/file2 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf491d05764dd649d2b48426b670fb992e0e4d688c6a91da5b80e7b56aae7a61 +size 15 diff --git a/tests/integration/archive/ar/__output__/gnu_mixed.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/gnu_mixed.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/gnu_mixed.ar_extract/short b/tests/integration/archive/ar/__output__/gnu_mixed.ar_extract/short new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/gnu_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/gnu_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/gnu_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space b/tests/integration/archive/ar/__output__/gnu_multi_names.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/gnu_single_name.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/gnu_single_name.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/msvc_lib.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length b/tests/integration/archive/ar/__output__/msvc_lib.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/msvc_lib.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space b/tests/integration/archive/ar/__output__/msvc_lib.ar_extract/a_very_long_name_for_the_gnu_type_header_so_it_can_overflow_the_standard_name_length_with_space new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/normal.ar_extract/short b/tests/integration/archive/ar/__output__/normal.ar_extract/short new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/sym.ar_extract/a.o b/tests/integration/archive/ar/__output__/sym.ar_extract/a.o new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/integration/archive/ar/__output__/windows.ar_extract/short b/tests/integration/archive/ar/__output__/windows.ar_extract/short new file mode 100644 index 0000000000..e69de29bb2