Skip to content

Commit

Permalink
update: adding named parameter pdf_engine to .convert(); adding test…
Browse files Browse the repository at this point in the history
… cases for pdf. Raised exceptions when pdf_engine is not valid.
  • Loading branch information
tungsten106 committed Dec 25, 2024
1 parent ba5df9b commit e808548
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 5 deletions.
8 changes: 4 additions & 4 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union
from typing import Any, Dict, List, Optional, Union, Literal
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
Expand Down Expand Up @@ -680,7 +680,7 @@ class PdfConverter(DocumentConverter):
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
"""
Example:
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
Expand All @@ -690,13 +690,13 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
pdf_engine = kwargs.get("pdf_engine", "pdfminer")
if pdf_engine == "pdfminer":
text_content = pdfminer.high_level.extract_text(local_path)
elif pdf_engine == "pymupdf4llm":
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
else:
return None # unknown method
# return None # unknown method
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
return DocumentConverterResult(title=None, text_content=text_content)


Expand Down
Binary file added tests/test_files/2308.08155v2.pdf
Binary file not shown.
18 changes: 17 additions & 1 deletion tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
import requests

from warnings import catch_warnings, resetwarnings

import os
import sys

# Make the in-repo package importable without a machine-specific absolute path.
# This test file lives in tests/, and the package sources live in the sibling
# src/ directory, so resolve src/ relative to this file's own location.
sys.path.insert(
    0,
    os.path.normpath(
        os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "src")
    ),
)
from markitdown import MarkItDown

skip_remote = (
Expand Down Expand Up @@ -299,6 +300,20 @@ def test_markitdown_llm() -> None:
for test_string in ["red", "circle", "blue", "square"]:
assert test_string in result.text_content.lower()

def test_markitdown_pdf() -> None:
    """Convert the bundled sample PDF with each PDF engine and check its text.

    Every entry of PDF_TEST_STRINGS must appear in the converted text for
    both the "pymupdf4llm" and the "pdfminer" engine.
    """
    converter = MarkItDown()

    # A local copy of the PDF is used here; PDF_TEST_URL may also be fine.
    pdf_path = os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf")

    # Engines are exercised in the same order as before: pymupdf4llm, then pdfminer.
    for engine in ("pymupdf4llm", "pdfminer"):
        result = converter.convert(pdf_path, pdf_engine=engine)
        for expected in PDF_TEST_STRINGS:
            assert expected in result.text_content

if __name__ == "__main__":
"""Runs this file's tests from the command line."""
Expand All @@ -307,3 +322,4 @@ def test_markitdown_llm() -> None:
test_markitdown_exiftool()
test_markitdown_deprecation()
test_markitdown_llm()
test_markitdown_pdf()

0 comments on commit e808548

Please sign in to comment.