update: adding named parameter pdf_engine to .convert(); adding test…

… cases for pdf. Raise an exception when pdf_engine is not valid.
microsoft · Dec 25, 2024 · e808548 · e808548
1 parent ba5df9b
commit e808548
Show file tree

Hide file tree

Showing 3 changed files with 21 additions and 5 deletions.
diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -14,7 +14,7 @@
 import traceback
 import zipfile
 from xml.dom import minidom
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Optional, Union, Literal
 from pathlib import Path
 from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
 from warnings import warn, resetwarnings, catch_warnings
@@ -680,7 +680,7 @@ class PdfConverter(DocumentConverter):
     Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.    
     """
 
-    def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
+    def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
         """
         Example:
         >>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
@@ -690,13 +690,13 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         extension = kwargs.get("file_extension", "")
         if extension.lower() != ".pdf":
             return None
-        pdf_engine = kwargs.get("pdf_engine", "pdfminer")
         if pdf_engine == "pdfminer":
             text_content = pdfminer.high_level.extract_text(local_path)
         elif pdf_engine == "pymupdf4llm":
             text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
         else:
-            return None     # unknown method
+            # return None     # unknown method
+            raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
         return DocumentConverterResult(title=None, text_content=text_content)
 
 

diff --git a/tests/test_files/2308.08155v2.pdf b/tests/test_files/2308.08155v2.pdf
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -7,7 +7,8 @@
 import requests
 
 from warnings import catch_warnings, resetwarnings
-
+import sys
+sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
 from markitdown import MarkItDown
 
 skip_remote = (
@@ -299,6 +300,20 @@ def test_markitdown_llm() -> None:
     for test_string in ["red", "circle", "blue", "square"]:
         assert test_string in result.text_content.lower()
 
+def test_markitdown_pdf() -> None:
+    markitdown = MarkItDown()
+
+    # Tested against the local PDF file; fetching PDF_TEST_URL would also work.
+
+    # By pymupdf4llm
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm")
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
+
+    # By pdfminer
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer")
+    for test_string in PDF_TEST_STRINGS:
+        assert test_string in result.text_content
 
 if __name__ == "__main__":
     """Runs this file's tests from the command line."""
@@ -307,3 +322,4 @@ def test_markitdown_llm() -> None:
     test_markitdown_exiftool()
     test_markitdown_deprecation()
     test_markitdown_llm()
+    test_markitdown_pdf()