Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions python/unblob/handlers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
cab,
cpio,
dmg,
msi,
partclone,
rar,
sevenzip,
Expand Down Expand Up @@ -88,6 +89,7 @@
arc.ARCHandler,
arj.ARJHandler,
cab.CABHandler,
msi.MsiHandler,
tar.TarUstarHandler,
tar.TarUnixHandler,
cpio.PortableASCIIHandler,
Expand Down
71 changes: 71 additions & 0 deletions python/unblob/handlers/archive/msi.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""MSI Handler

Extraction uses 7z for now. Could migrate to a full native implementation:

https://github.com/nightlark/pymsi
"""

from typing import Optional
import io

import pymsi
from structlog import get_logger

from unblob.extractors import Command

from ...models import (
File,
Handler,
HandlerDoc,
HandlerType,
HexString,
Reference,
ValidChunk,
)

logger = get_logger()


# Handler for Microsoft Installer (MSI) packages: matches on an 8-byte
# signature, validates the candidate with pymsi, and extracts with 7z.
class MsiHandler(Handler):
NAME = "msi"

# NOTE(review): this 8-byte signature looks like the generic compound-document
# (OLE) header rather than something MSI-specific — confirm that non-MSI
# compound documents are rejected by the pymsi parse in calculate_chunk.
PATTERNS = [
HexString("D0 CF 11 E0 A1 B1 1A E1")
]
# 7z invocation: "x" extracts with full paths, "-y" answers yes to all queries,
# "-o{outdir}" sets the output directory. "-p" is passed with no password,
# presumably so 7z never blocks on an interactive prompt — TODO confirm.
EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

# Static metadata describing this handler (name, vendor, reference links).
DOC = HandlerDoc(
name="MSI",
description="Microsoft Installer (MSI) files are used for the installation, maintenance, and removal of software.",
handler_type=HandlerType.ARCHIVE,
vendor="Microsoft",
references=[
Reference(
title="MSI File Format Documentation",
url="https://docs.microsoft.com/en-us/windows/win32/msi/overview-of-windows-installer",
)
],
limitations=[],
)

# Try to parse an MSI at start_offset and report the chunk it occupies.
# Returns None when pymsi raises while parsing; otherwise a ValidChunk.
def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
file.seek(start_offset, io.SEEK_SET)

try:
# TODO: pymsi wants a path or BytesIO
# NOTE(review): file[:] copies the entire input into memory, and a full
# slice presumably ignores the seek above — confirm that the copy (and
# therefore the parse) really begins at start_offset.
buf = io.BytesIO()
buf.write(file[:])
buf.seek(0)

# Parsing is used purely as validation; the resulting `msi` object is not
# read again in the visible code. The meaning of the positional True is
# not visible here — see the pymsi.Msi signature.
package = pymsi.Package(buf)
msi = pymsi.Msi(package, True)
# NOTE(review): every exception is treated as "not an MSI" and dropped
# without logging, which also hides genuine bugs — consider narrowing.
except Exception:
return None

# MSI moves the file pointer
# NOTE(review): tell() only reports where the parser stopped reading inside
# the copy; nothing here guarantees that this is the end of the file's data,
# and the value is relative to the copy, not offset by start_offset.
msi_end_offset = buf.tell()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not right. If you look at the output directory when run on the 7z MSI, you'll see that it carves two chunks:

0-1545728.msi
1545728-1563648.unknown

Looking into the unknown chunk, we see information that belongs to the Summary field:

hexdump -C 1545728-1563648.unknown
--snip--
00004470  1e 00 00 00 16 00 00 00  49 6e 73 74 61 6c 6c 61  |........Installa|
00004480  74 69 6f 6e 20 44 61 74  61 62 61 73 65 00 00 00  |tion Database...|
00004490  1e 00 00 00 0e 00 00 00  37 2d 5a 69 70 20 50 61  |........7-Zip Pa|
000044a0  63 6b 61 67 65 00 00 00  1e 00 00 00 0c 00 00 00  |ckage...........|
000044b0  49 67 6f 72 20 50 61 76  6c 6f 76 00 1e 00 00 00  |Igor Pavlov.....|
000044c0  0a 00 00 00 49 6e 73 74  61 6c 6c 65 72 00 00 00  |....Installer...|
000044d0  1e 00 00 00 0e 00 00 00  37 2d 5a 69 70 20 50 61  |........7-Zip Pa|
000044e0  63 6b 61 67 65 00 00 00  1e 00 00 00 0b 00 00 00  |ckage...........|
000044f0  49 6e 74 65 6c 3b 31 30  33 33 00 00 1e 00 00 00  |Intel;1033......|
00004500  27 00 00 00 7b 32 33 31  37 30 46 36 39 2d 34 30  |'...{23170F69-40|
00004510  43 31 2d 32 37 30 31 2d  32 35 30 31 2d 30 30 30  |C1-2701-2501-000|
00004520  30 30 32 30 30 30 30 30  30 7d 00 00 03 00 00 00  |002000000}......|
00004530  c8 00 00 00 03 00 00 00  02 00 00 00 03 00 00 00  |................|
00004540  02 00 00 00 40 00 00 00  80 8a 97 7e 8e 04 dc 01  |....@......~....|
00004550  40 00 00 00 80 8a 97 7e  8e 04 dc 01 1e 00 00 00  |@......~........|
00004560  31 00 00 00 57 69 6e 64  6f 77 73 20 49 6e 73 74  |1...Windows Inst|
00004570  61 6c 6c 65 72 20 58 4d  4c 20 76 32 2e 30 2e 33  |aller XML v2.0.3|
00004580  37 31 39 2e 30 20 28 63  61 6e 64 6c 65 2f 6c 69  |719.0 (candle/li|
00004590  67 68 74 29 00 00 00 00  00 00 00 00 00 00 00 00  |ght)............|
000045a0  00 00 00 00 00 00 00 00  00 00 00 00 00 00 00 00  |................|

You can check that by opening the file in https://pymsi.readthedocs.io/en/latest/msi_viewer.html and looking at the Summary tab.

I know very little about the MSI format, but it looks like the end offset could be calculated from the OLE format that MSI is built on. Probably some magic involving sector sizes and sector counts.


# Report the chunk as running from the matched offset to the end offset
# computed earlier in this method.
return ValidChunk(
start_offset = start_offset,
end_offset = msi_end_offset,
)
4 changes: 3 additions & 1 deletion python/unblob/processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@
DEFAULT_PROCESS_NUM = multiprocessing.cpu_count()
DEFAULT_SKIP_MAGIC = (
"BFLT",
"Composite Document File V2 Document",
# TODO: Need to disable this for MSI but does it need to be enabled for
# other types of Composite Documents?
#"Composite Document File V2 Document",
"Erlang BEAM file",
"GIF",
"GNU message catalog",
Expand Down