Skip to content

Commit 6117aa6

Browse files
AlonNaor22 and claude committed
fix: handle multipart MIME attachments in partition_email
When a PGP-signed email is wrapped in a multipart/mixed envelope, the multipart/signed part appears as an "attachment" via iter_attachments(). Calling get_content() on it raises KeyError because Python's email.contentmanager has no handler for multipart content types.

Fix in three layers:
- _file_bytes: check is_multipart() first and use as_bytes() to get the raw MIME bytes instead of calling get_content(), which cannot handle multipart types.
- _iter_elements: move file creation inside the try/except block so all attachment-processing errors go through the same error handler.
- EXPECTED_ATTACHMENT_ERRORS: add KeyError as a safety net for any remaining unhandled content types from the content manager.

Closes #3922

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent f6fcba4 commit 6117aa6

File tree

5 files changed

+136
-3
lines changed

5 files changed

+136
-3
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
MIME-Version: 1.0
2+
From: sender@example.com
3+
To: recipient@example.com
4+
Subject: PGP Signed Email
5+
Date: Mon, 10 Feb 2025 12:00:00 +0000
6+
Message-ID: <signed-test@example.com>
7+
Content-Type: multipart/signed; micalg=pgp-sha256;
8+
protocol="application/pgp-signature";
9+
boundary="signed-boundary"
10+
11+
--signed-boundary
12+
Content-Type: text/plain; charset=utf-8
13+
Content-Transfer-Encoding: 7bit
14+
15+
This is a PGP signed email body.
16+
17+
--signed-boundary
18+
Content-Type: application/pgp-signature; name="signature.asc"
19+
Content-Disposition: attachment; filename="signature.asc"
20+
Content-Transfer-Encoding: 7bit
21+
22+
-----BEGIN PGP SIGNATURE-----
23+
iQEzBAABCAAdFiEEfakeSignatureDataHere
24+
-----END PGP SIGNATURE-----
25+
26+
--signed-boundary--
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
MIME-Version: 1.0
2+
From: sender@example.com
3+
To: recipient@example.com
4+
Subject: PGP Signed Email With Attachment
5+
Date: Mon, 10 Feb 2025 12:00:00 +0000
6+
Message-ID: <signed-attach-test@example.com>
7+
Content-Type: multipart/mixed; boundary="outer-boundary"
8+
9+
--outer-boundary
10+
Content-Type: multipart/signed; micalg=pgp-sha256;
11+
protocol="application/pgp-signature";
12+
boundary="signed-boundary"
13+
14+
--signed-boundary
15+
Content-Type: text/plain; charset=utf-8
16+
Content-Transfer-Encoding: 7bit
17+
18+
This is a PGP signed email with an attachment.
19+
20+
--signed-boundary
21+
Content-Type: application/pgp-signature; name="signature.asc"
22+
Content-Disposition: attachment; filename="signature.asc"
23+
Content-Transfer-Encoding: 7bit
24+
25+
-----BEGIN PGP SIGNATURE-----
26+
iQEzBAABCAAdFiEEfakeSignatureDataHere
27+
-----END PGP SIGNATURE-----
28+
29+
--signed-boundary--
30+
31+
--outer-boundary
32+
Content-Type: text/plain; charset=utf-8
33+
Content-Disposition: attachment; filename="note.txt"
34+
Content-Transfer-Encoding: 7bit
35+
36+
This is a text attachment.
37+
38+
--outer-boundary--

test_unstructured/partition/test_email.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,66 @@ def test_partition_email_silently_skips_attachments_it_cannot_partition():
389389
]
390390

391391

392+
def test_partition_email_handles_multipart_signed_attachment():
393+
"""A PGP-signed email wrapped in multipart/mixed does not crash (issue #3922).
394+
395+
When a multipart/signed part appears as an "attachment" inside a multipart/mixed
396+
envelope, ``get_content()`` raises ``KeyError`` because Python's content-manager
397+
has no handler for multipart types. The partitioner should handle this gracefully
398+
by falling back to ``as_bytes()`` in ``_file_bytes``.
399+
"""
400+
import email
401+
import email.policy
402+
403+
from unstructured.partition.email import _AttachmentPartitioner
404+
405+
with open(example_doc_path("eml/mime-signed-with-attachment.eml"), "rb") as f:
406+
msg = email.message_from_binary_file(f, policy=email.policy.default)
407+
408+
ctx = EmailPartitioningContext(
409+
example_doc_path("eml/mime-signed-with-attachment.eml"),
410+
process_attachments=True,
411+
)
412+
413+
for att in msg.iter_attachments():
414+
if att.is_multipart():
415+
partitioner = _AttachmentPartitioner(att, ctx)
416+
# -- this used to raise KeyError: 'multipart/signed' --
417+
file_bytes = partitioner._file_bytes
418+
assert isinstance(file_bytes, bytes)
419+
assert len(file_bytes) > 0
420+
assert b"PGP signed email" in file_bytes
421+
break
422+
else:
423+
pytest.fail("No multipart attachment found in test email")
424+
425+
426+
def test_partition_email_file_bytes_works_for_non_multipart():
427+
"""_file_bytes still works normally for regular (non-multipart) attachments."""
428+
import email
429+
import email.policy
430+
431+
from unstructured.partition.email import _AttachmentPartitioner
432+
433+
with open(example_doc_path("eml/mime-signed-with-attachment.eml"), "rb") as f:
434+
msg = email.message_from_binary_file(f, policy=email.policy.default)
435+
436+
ctx = EmailPartitioningContext(
437+
example_doc_path("eml/mime-signed-with-attachment.eml"),
438+
process_attachments=True,
439+
)
440+
441+
for att in msg.iter_attachments():
442+
if not att.is_multipart():
443+
partitioner = _AttachmentPartitioner(att, ctx)
444+
file_bytes = partitioner._file_bytes
445+
assert isinstance(file_bytes, bytes)
446+
assert b"text attachment" in file_bytes
447+
break
448+
else:
449+
pytest.fail("No non-multipart attachment found in test email")
450+
451+
392452
# ================================================================================================
393453
# ISOLATED UNIT TESTS
394454
# ================================================================================================

unstructured/partition/common/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,4 +16,5 @@ class UnsupportedFileFormatError(Exception):
1616
UnsupportedFileFormatError,
1717
ImportError,
1818
FileNotFoundError,
19+
KeyError,
1920
)

unstructured/partition/email.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -392,10 +392,9 @@ def _iter_elements(self) -> Iterator[Element]:
392392
# -- avoid a circular import.
393393
from unstructured.partition.auto import partition
394394

395-
file = io.BytesIO(self._file_bytes)
396-
397395
# -- partition the attachment --
398396
try:
397+
file = io.BytesIO(self._file_bytes)
399398
elements = partition(
400399
file=file,
401400
metadata_filename=self._attachment_file_name,
@@ -431,7 +430,16 @@ def _attachment_file_name(self) -> str | None:
431430

432431
@lazyproperty
433432
def _file_bytes(self) -> bytes:
434-
"""The bytes of the attached file."""
433+
"""The bytes of the attached file.
434+
435+
For multipart MIME parts (e.g. multipart/signed in PGP-signed emails),
436+
``get_content()`` raises ``KeyError`` because Python's content-manager has
437+
no handler for multipart types. In that case we fall back to the raw MIME
438+
bytes of the part so downstream partitioners can still attempt to process it.
439+
"""
440+
if self._attachment.is_multipart():
441+
return self._attachment.as_bytes()
442+
435443
content = self._attachment.get_content()
436444

437445
if isinstance(content, str):

0 commit comments

Comments
 (0)