add support for EML

microsoft · Jan 9, 2025 · 68cc8aa · 68cc8aa
1 parent 1deaba1
commit 68cc8aa
Show file tree

Hide file tree

Showing 3 changed files with 148 additions and 0 deletions.
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -17,6 +17,9 @@
 from typing import Any, Dict, List, Optional, Union
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
+from email import policy
+from email.parser import Parser
+from email.utils import parseaddr
 
 import mammoth
 import markdownify
@@ -1075,6 +1078,96 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
         return response.choices[0].message.content
 
 
+class EmlConverter(DocumentConverter):
+    """Converts EML (email) files to Markdown. Preserves headers, body, and attachments info."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EML file to markdown.
+
+        The output has an "Email Headers" section (From, To, Subject, Date and,
+        if present, CC), an "Email Content" section with the text body, and an
+        "Attachments" section listing each named attachment with its MIME type
+        and size in bytes.
+
+        Args:
+            local_path: Path to the EML file
+            **kwargs: Only "file_extension" is read; it must end with ".eml"
+
+        Returns:
+            DocumentConverterResult titled with the email subject, or None when
+            the file extension is not ".eml"
+        """
+        # Imported locally so these additions do not depend on the module's
+        # top-level import list.
+        from email.parser import BytesParser
+        from email.utils import getaddresses
+
+        # Check if this is an EML file (`or ""` tolerates an explicit None)
+        file_ext = (kwargs.get("file_extension") or "").lower()
+        if not file_ext.endswith(".eml"):
+            return None
+
+        # Parse from bytes: every MIME part declares its own charset and
+        # transfer encoding, so decoding the whole file as UTF-8 up front raises
+        # UnicodeDecodeError on mail that contains other 8-bit encodings.
+        with open(local_path, "rb") as fp:
+            msg = BytesParser(policy=policy.default).parse(fp)
+
+        # Initialize result with email subject as title
+        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))
+
+        def format_address(name: str, address: str) -> str:
+            """Render one mailbox as "Name <address>", or the bare address."""
+            return f"{name} <{address}>" if name else address
+
+        # Build markdown content, starting with the headers
+        md_parts = ["## Email Headers\n"]
+
+        md_parts.append("**From:** " + format_address(*parseaddr(msg.get("from", ""))))
+
+        # A To header may list several recipients; parseaddr() handles only a
+        # single address, so collect them all with getaddresses(). Empty pairs
+        # (returned for a missing/unparsable header) are dropped.
+        recipients = [
+            format_address(name, address)
+            for name, address in getaddresses([msg.get("to", "")])
+            if name or address
+        ]
+        md_parts.append("**To:** " + ", ".join(recipients))
+        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
+        md_parts.append(f"**Date:** {msg.get('date', '')}")
+
+        # Add CC if present
+        cc = msg.get("cc")
+        if cc:
+            md_parts.append(f"**CC:** {cc}")
+
+        md_parts.append("\n## Email Content\n")
+
+        # Handle the email body
+        if msg.is_multipart():
+            # Text parts sent as attachments are listed below, not inlined here
+            body_parts = [
+                part
+                for part in msg.walk()
+                if part.get_content_type() in ("text/plain", "text/html")
+                and part.get_content_disposition() != "attachment"
+            ]
+            # Computed once, instead of re-walking the message per HTML part
+            has_plain = any(p.get_content_type() == "text/plain" for p in body_parts)
+            for part in body_parts:
+                # HTML is only a fallback for messages without a plain-text body
+                if has_plain and part.get_content_type() == "text/html":
+                    continue
+                md_parts.append(part.get_content())
+        else:
+            md_parts.append(msg.get_content())
+
+        # List attachments if any
+        attachments = []
+        if msg.is_multipart():
+            for part in msg.walk():
+                filename = part.get_filename()
+                if part.get_content_disposition() != "attachment" or not filename:
+                    continue
+                # Size is the transfer-decoded payload in bytes. Container parts
+                # (e.g. an attached message/rfc822) have no single payload, so
+                # fall back to their serialized size, which includes headers.
+                payload = part.get_payload(decode=True)
+                size = len(part.as_bytes() if payload is None else payload)
+                attachments.append(
+                    f"- {filename} ({part.get_content_type()}, {size:,} bytes)"
+                )
+
+        if attachments:
+            md_parts.append("\n## Attachments\n")
+            md_parts.extend(attachments)
+
+        # Combine all parts
+        result.text_content = "\n".join(md_parts)
+
+        return result
+
+
 class ZipConverter(DocumentConverter):
     """Converts ZIP files to markdown by extracting and converting all contained files.
 
@@ -1273,6 +1366,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(EmlConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any

diff --git a/tests/test_files/test.eml b/tests/test_files/test.eml
@@ -0,0 +1,33 @@
+Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
+MIME-Version: 1.0
+Subject: Test Email Document
+From: John Doe <[email protected]>
+To: Jane Smith <[email protected]>
+Date: Wed, 18 Dec 2024 10:00:00 +0000
+CC: [email protected]
+
+--===============8484938434343225034==
+Content-Type: text/plain; charset="us-ascii"
+MIME-Version: 1.0
+Content-Transfer-Encoding: 7bit
+
+
+This is a test email with multiple parts.
+
+It contains:
+- Plain text content
+- An attachment
+- Various headers
+
+Best regards,
+John Doe
+
+--===============8484938434343225034==
+Content-Type: application/txt
+MIME-Version: 1.0
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="test.txt"
+
+VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
+
+--===============8484938434343225034==--
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -126,6 +126,20 @@
     "髙橋淳,35,名古屋",
 ]
 
+# Substrings that must each appear in the markdown produced for
+# tests/test_files/test.eml.
+# NOTE(review): "[email protected]" below looks like address obfuscation
+# introduced where this diff was captured; confirm the literal addresses
+# against test.eml before relying on these strings.
+EML_TEST_STRINGS = [
+    "## Email Headers",
+    "**From:** John Doe <[email protected]>",
+    "**To:** Jane Smith <[email protected]>",
+    "**Subject:** Test Email Document",
+    "**CC:** [email protected]",
+    "## Email Content",
+    "This is a test email with multiple parts",
+    "- Plain text content",
+    "- An attachment",
+    "## Attachments",
+    "- test.txt (application/txt, 31 bytes)",
+]
+
 LLM_TEST_STRINGS = [
     "5bda1dd6",
 ]
@@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
         text_content = result.text_content.replace("\\", "")
         assert test_string in text_content
 
+    # Test EML processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
+    assert result.title == "Test Email Document"
+    for test_string in EML_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test HTML processing
     result = markitdown.convert(
         os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL