Skip to content

Commit

Permalink
add support for EML
Browse files Browse the repository at this point in the history
  • Loading branch information
0xRaduan committed Jan 9, 2025
1 parent 1deaba1 commit 68cc8aa
Show file tree
Hide file tree
Showing 3 changed files with 148 additions and 0 deletions.
94 changes: 94 additions & 0 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
from typing import Any, Dict, List, Optional, Union
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
from email import policy
from email.parser import Parser
from email.utils import parseaddr

import mammoth
import markdownify
Expand Down Expand Up @@ -1075,6 +1078,96 @@ def _get_llm_description(self, local_path, extension, client, model, prompt=None
return response.choices[0].message.content


class EmlConverter(DocumentConverter):
    """Converts EML (email) files to Markdown.

    The output contains the main headers (From, To, Subject, Date, CC), the
    text body and a list of attachments (file name, MIME type and decoded
    size). Attachment contents are not converted.
    """

    def convert(
        self, local_path: str, **kwargs: Any
    ) -> Union[None, DocumentConverterResult]:
        """Convert an EML file to markdown.

        Args:
            local_path: Path to the EML file.
            **kwargs: Additional arguments; only ``file_extension`` is read.

        Returns:
            A DocumentConverterResult whose title is the email subject and
            whose text is the generated markdown, or None when
            ``file_extension`` does not end with ".eml".
        """
        # Local import: BytesParser is only needed by this converter.
        from email.parser import BytesParser

        # Bail out early for anything that is not an EML file. ``or ""``
        # guards against an explicit ``file_extension=None``.
        file_ext = (kwargs.get("file_extension") or "").lower()
        if not file_ext.endswith(".eml"):
            return None

        # Parse from bytes: an email may mix several charsets (declared per
        # part), so decoding the whole file as UTF-8 up front can fail.
        with open(local_path, "rb") as fp:
            msg = BytesParser(policy=policy.default).parse(fp)

        # Initialize result with email subject as title
        result = DocumentConverterResult(title=msg.get("subject", "Untitled Email"))

        md_parts: List[str] = ["## Email Headers\n"]
        md_parts.append(f"**From:** {self._format_addresses(msg, 'from')}")
        md_parts.append(f"**To:** {self._format_addresses(msg, 'to')}")
        md_parts.append(f"**Subject:** {msg.get('subject', '')}")
        md_parts.append(f"**Date:** {msg.get('date', '')}")

        # CC is optional and only emitted when present
        cc = msg.get("cc")
        if cc:
            md_parts.append(f"**CC:** {cc}")

        md_parts.append("\n## Email Content\n")

        attachments: List[str] = []
        if msg.is_multipart():
            plain_parts: List[str] = []
            html_parts: List[str] = []
            self._collect_parts(msg, plain_parts, html_parts, attachments)
            # Prefer the plain-text body; fall back to the raw HTML parts
            # only when the message carries no plain-text alternative.
            md_parts.extend(plain_parts or html_parts)
        elif msg.get_content_maintype() == "text":
            md_parts.append(self._get_text(msg))
        # A single-part, non-text message has no textual body to render.

        if attachments:
            md_parts.append("\n## Attachments\n")
            md_parts.extend(attachments)

        # Combine all parts
        result.text_content = "\n".join(md_parts)

        return result

    def _collect_parts(
        self,
        part: Any,
        plain_parts: List[str],
        html_parts: List[str],
        attachments: List[str],
    ) -> None:
        """Walk ``part`` depth-first, sorting leaves into body text and attachments.

        Attachments are recorded (when they have a file name) but never
        descended into, so the text of an attached file or attached message
        does not end up in the email body.
        """
        if part.get_content_disposition() == "attachment":
            filename = part.get_filename()
            if filename:
                mime_type = part.get_content_type()
                size = self._payload_size(part)
                attachments.append(f"- {filename} ({mime_type}, {size:,} bytes)")
            return

        if part.is_multipart():
            for sub_part in part.get_payload():
                self._collect_parts(sub_part, plain_parts, html_parts, attachments)
            return

        content_type = part.get_content_type()
        if content_type == "text/plain":
            plain_parts.append(self._get_text(part))
        elif content_type == "text/html":
            html_parts.append(self._get_text(part))

    @staticmethod
    def _format_addresses(msg: Any, header_name: str) -> str:
        """Render every address of ``header_name`` as ``Name <addr>`` or ``addr``."""
        from email.utils import getaddresses

        raw_values = [str(value) for value in msg.get_all(header_name, [])]
        return ", ".join(
            f"{name} <{address}>" if name else address
            for name, address in getaddresses(raw_values)
            if name or address
        )

    @staticmethod
    def _get_text(part: Any) -> str:
        """Return the decoded text of a non-multipart text part."""
        try:
            return part.get_content()
        except (LookupError, UnicodeError):
            # Unknown/invalid declared charset: decode leniently rather than
            # failing the whole conversion.
            payload = part.get_payload(decode=True) or b""
            return payload.decode("utf-8", errors="replace")

    @staticmethod
    def _payload_size(part: Any) -> int:
        """Return the size in bytes of an attachment's decoded payload."""
        if part.is_multipart():
            # Container attachments (e.g. message/rfc822) have no single
            # decoded payload; measure their serialized sub-messages instead.
            return sum(len(sub_part.as_bytes()) for sub_part in part.get_payload())
        return len(part.get_payload(decode=True) or b"")


class ZipConverter(DocumentConverter):
"""Converts ZIP files to markdown by extracting and converting all contained files.
Expand Down Expand Up @@ -1273,6 +1366,7 @@ def __init__(
self.register_page_converter(IpynbConverter())
self.register_page_converter(PdfConverter())
self.register_page_converter(ZipConverter())
self.register_page_converter(EmlConverter())

def convert(
self, source: Union[str, requests.Response], **kwargs: Any
Expand Down
33 changes: 33 additions & 0 deletions tests/test_files/test.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
Content-Type: multipart/mixed; boundary="===============8484938434343225034=="
MIME-Version: 1.0
Subject: Test Email Document
From: John Doe <[email protected]>
To: Jane Smith <[email protected]>
Date: Wed, 18 Dec 2024 10:00:00 +0000
CC: [email protected]

--===============8484938434343225034==
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
This is a test email with multiple parts.
It contains:
- Plain text content
- An attachment
- Various headers
Best regards,
John Doe
--===============8484938434343225034==
Content-Type: application/txt
MIME-Version: 1.0
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="test.txt"
VGhpcyBpcyB0ZXN0IGF0dGFjaG1lbnQgY29udGVudA==
--===============8484938434343225034==--
21 changes: 21 additions & 0 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,20 @@
"髙橋淳,35,名古屋",
]

# Substrings that must all appear in the Markdown produced when converting
# tests/test_files/test.eml (checked one by one in the EML test section).
EML_TEST_STRINGS = [
"## Email Headers",
"**From:** John Doe <[email protected]>",
"**To:** Jane Smith <[email protected]>",
"**Subject:** Test Email Document",
"**CC:** [email protected]",
"## Email Content",
"This is a test email with multiple parts",
"- Plain text content",
"- An attachment",
"## Attachments",
"- test.txt (application/txt, 31 bytes)",
]

LLM_TEST_STRINGS = [
"5bda1dd6",
]
Expand Down Expand Up @@ -197,6 +211,13 @@ def test_markitdown_local() -> None:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content

# Test EML processing
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.eml"))
assert result.title == "Test Email Document"
for test_string in EML_TEST_STRINGS:
text_content = result.text_content.replace("\\", "")
assert test_string in text_content

# Test HTML processing
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "test_blog.html"), url=BLOG_TEST_URL
Expand Down

0 comments on commit 68cc8aa

Please sign in to comment.