from email import policy
from email.parser import BytesParser, Parser
from email.utils import getaddresses, parseaddr


class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown.

    The output has an "Email Headers" section (From, To, Subject, Date and,
    when present, CC), an "Email Content" section holding the message body,
    and an "Attachments" section listing each named attachment with its MIME
    type and decoded size.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file.
            **kwargs: Only ``file_extension`` is consulted; the file is
                handled when that value ends with ``.eml`` (case-insensitive).

        Returns:
            A DocumentConverterResult whose title is the email subject and
            whose text content is the rendered markdown, or None when the
            file extension is not ``.eml``.
        """
        # Tolerate a missing or explicitly-None extension instead of crashing.
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".eml"):
            return None

        # Parse from bytes: each MIME part declares its own charset, so
        # decoding the whole file as UTF-8 up front fails on 8-bit messages.
        with open(local_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        # Initialize result with email subject as title
        subject = msg.get("subject")
        result = DocumentConverterResult(
            title=str(subject) if subject is not None else "Untitled Email"
        )

        sender = self._format_addresses(str(msg.get("from", "")))
        recipients = self._format_addresses(str(msg.get("to", "")))
        md_parts = [
            "## Email Headers\n",
            f"**From:** {sender}",
            f"**To:** {recipients}",
            f"**Subject:** {msg.get('subject', '')}",
            f"**Date:** {msg.get('date', '')}",
        ]

        # Add CC if present
        cc = msg.get("cc")
        if cc:
            md_parts.append(f"**CC:** {cc}")

        md_parts.append("\n## Email Content\n")

        # walk() also yields the message itself when it is not multipart, so
        # one traversal serves both the body and the attachment listing.
        all_parts = list(msg.walk())

        if msg.is_multipart():
            # Attached text files belong in the attachment list, not the body.
            inline_parts = [
                part
                for part in all_parts
                if part.get_content_disposition() != "attachment"
            ]
            has_plain = any(
                part.get_content_type() == "text/plain" for part in inline_parts
            )
            # Prefer plain text; fall back to the (unconverted) HTML parts
            # only when the message carries no plain-text alternative.
            wanted_type = "text/plain" if has_plain else "text/html"
            md_parts.extend(
                self._part_text(part)
                for part in inline_parts
                if part.get_content_type() == wanted_type
            )
        elif msg.get_content_maintype() == "text":
            md_parts.append(self._part_text(msg))
        # A single-part non-text message has no renderable body; its content
        # would be bytes and must not be mixed into the markdown lines.

        # List attachments if any
        attachments = []
        for part in all_parts:
            if part.get_content_disposition() != "attachment":
                continue
            filename = part.get_filename()
            if not filename:
                continue
            mime_type = part.get_content_type()
            # Decoded payload length is the real attachment size in bytes;
            # it is None for container parts such as message/rfc822.
            payload = part.get_payload(decode=True)
            if payload is None:
                attachments.append(f"- {filename} ({mime_type})")
            else:
                attachments.append(
                    f"- {filename} ({mime_type}, {len(payload):,} bytes)"
                )

        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        # Combine all parts
        result.text_content = "\n".join(md_parts)

        return result

    @staticmethod
    def _format_addresses(header_value: str) -> str:
        """Render every address of a header as ``Name <addr>`` or bare ``addr``.

        Falls back to the stripped raw header text when no address can be
        extracted, so a malformed header is still shown rather than dropped.
        """
        rendered = []
        for name, addr in getaddresses([header_value]):
            if name and addr:
                rendered.append(f"{name} <{addr}>")
            elif name or addr:
                rendered.append(name or addr)
        return ", ".join(rendered) or header_value.strip()

    @staticmethod
    def _part_text(part: Any) -> str:
        """Return the decoded text of a text/* part.

        If the declared charset is unknown or unusable, the transfer-decoded
        payload is decoded as UTF-8 with replacement characters instead.
        """
        try:
            return part.get_content()
        except (LookupError, UnicodeError):
            payload = part.get_payload(decode=True) or b""
            return payload.decode("utf-8", errors="replace")
@@ -1405,6 +1498,7 @@ def __init__( self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) self.register_page_converter(OutlookMsgConverter()) + self.register_page_converter(EmlConverter()) def convert( self, source: Union[str, requests.Response, Path], **kwargs: Any diff --git a/tests/test_files/test.eml b/tests/test_files/test.eml new file mode 100644 index 0000000..15f6b85 --- /dev/null +++ b/tests/test_files/test.eml @@ -0,0 +1,33 @@ +Content-Type: multipart/mixed; boundary="===============8484938434343225034==" +MIME-Version: 1.0 +Subject: Test Email Document +From: John Doe +To: Jane Smith +Date: Wed, 18 Dec 2024 10:00:00 +0000 +CC: cc.person@example.com + +--===============8484938434343225034== +Content-Type: text/plain; charset="us-ascii" +MIME-Version: 1.0 +Content-Transfer-Encoding: 7bit + + +This is a test email with multiple parts. + +It contains: +- Plain text content +- An attachment +- Various headers + +Best regards, +John Doe + +--===============8484938434343225034== +Content-Type: application/txt +MIME-Version: 1.0 +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; filename="test.txt" + +VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA== + +--===============8484938434343225034==-- diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 689d6f3..8be2816 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -141,6 +141,20 @@ "髙橋淳,35,名古屋", ] +EML_TEST_STRINGS = [ + "## Email Headers", + "**From:** John Doe ", + "**To:** Jane Smith ", + "**Subject:** Test Email Document", + "**CC:** cc.person@example.com", + "## Email Content", + "This is a test email with multiple parts", + "- Plain text content", + "- An attachment", + "## Attachments", + "- test.txt (application/txt, 31 bytes)", +] + LLM_TEST_STRINGS = [ "5bda1dd6", ] @@ -224,6 +238,13 @@ def test_markitdown_local() -> None: result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.pptx")) validate_strings(result, 
PPTX_TEST_STRINGS) + # Test EML processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml")) + assert result.title == "Test Email Document" + for test_string in EML_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test HTML processing result = markitdown.convert( os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL