microsoft · 0xRaduan · Dec 18, 2024 · Dec 18, 2024 · Dec 19, 2024 · Jan 9, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -42,6 +42,8 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "ebooklib",
+  "html2text>=2020.1.16",
 ]
 
 [project.urls]

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -26,6 +26,9 @@
 import pdfminer
 import pdfminer.high_level
 import pptx
+from ebooklib import epub, ITEM_DOCUMENT
+import html2text
+
 
 # File-format detection
 import puremagic
@@ -75,6 +78,10 @@ def __init__(self, **options: Any):
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
+    def convert_em(self, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Render <em> as _text_; whitespace-only text becomes an empty string."""
+        return "" if not text.strip() else "_" + text + "_"
+
     def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
         """Same as usual, but be sure to start with a new line"""
         if not convert_as_inline:
@@ -696,6 +703,60 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         )
 
 
+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown. Preserves chapter structure and metadata."""
+
+    def convert(
+        self, local_path: str, **kwargs: Any
+    ) -> Union[None, DocumentConverterResult]:
+        """Convert an EPUB file to Markdown.
+
+        Args:
+            local_path: Path to the EPUB file.
+            **kwargs: Only "file_extension" is consulted.
+
+        Returns:
+            The converted document (title, metadata header and content), or
+            None when the supplied extension does not end with ".epub".
+        """
+        # The extension may be missing or None; both mean "not an EPUB".
+        file_ext = (kwargs.get("file_extension") or "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        def first_dc(name: str) -> Any:
+            # Look each Dublin Core field up once; use the first entry's value.
+            entries = book.get_metadata("DC", name)
+            return entries[0][0] if entries else None
+
+        result = DocumentConverterResult(title=first_dc("title"))
+
+        # Metadata header: author line, then the description as its own block.
+        metadata_md = []
+        creator = first_dc("creator")
+        if creator:
+            metadata_md.append(f"Author: {creator}")
+        description = first_dc("description")
+        if description:
+            metadata_md.append(f"\n{description}")
+
+        # Convert each document item, dropping those that render to nothing.
+        content_md = []
+        for item in book.get_items():
+            if item.get_type() != ITEM_DOCUMENT:
+                continue
+            # Replace undecodable bytes so one bad item cannot abort the book.
+            content = item.get_content().decode("utf-8", errors="replace")
+            html_result = HtmlConverter()._convert(content)
+            if html_result and html_result.text_content:
+                content_md.append(html_result.text_content)
+
+        result.text_content = "\n\n".join(metadata_md + content_md)
+        return result
+
+
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@@ -1405,6 +1466,7 @@ def __init__(
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
         self.register_page_converter(OutlookMsgConverter())
+        self.register_page_converter(EpubConverter())
 
     def convert(
         self, source: Union[str, requests.Response, Path], **kwargs: Any

diff --git a/tests/test_files/test.epub b/tests/test_files/test.epub
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -145,6 +145,18 @@
     "5bda1dd6",
 ]
 
+EPUB_TEST_STRINGS = [
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+
 JSON_TEST_STRINGS = [
     "5b64c88c-b3c3-4510-bcb8-da0b200602d8",
     "9700dc99-6685-40b4-9a3a-5e406dcb37f3",
@@ -192,6 +204,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
+    # Test EPUB processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    for test_string in EPUB_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     validate_strings(result, XLSX_TEST_STRINGS)