Skip to content

Commit

Permalink
update: Add engine_kwargs for customizing parameters. Update PdfConver…
Browse files Browse the repository at this point in the history
…ter engine-calling method to make it easier to add more engines. Added examples of using `engine_kwargs` to extract PDF images
  • Loading branch information
tungsten106 committed Dec 26, 2024
1 parent e808548 commit 565ef05
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 15 deletions.
29 changes: 20 additions & 9 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import traceback
import zipfile
from xml.dom import minidom
from typing import Any, Dict, List, Optional, Union, Literal
from typing import Any, Dict, List, Optional, Union, Literal, Mapping
from pathlib import Path
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse
from warnings import warn, resetwarnings, catch_warnings
Expand Down Expand Up @@ -679,24 +679,35 @@ class PdfConverter(DocumentConverter):
"""
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text.
"""
_engines: Mapping[str, Any] = {
"pdfminer": pdfminer.high_level.extract_text,
"pymupdf4llm": pymupdf4llm.to_markdown,
}

def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]:
def convert(
self,
local_path,
engine: Literal["pdfminer", "pymupdf4llm"] = "pdfminer",
engine_kwargs={},
**kwargs,
) -> Union[None, DocumentConverterResult]:
"""
Example:
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf"
>>> markitdown.convert(source, pdf_engine="pymupdf4llm")
>>> markitdown.convert(source, engine="pymupdf4llm")
"""
# Bail if not a PDF
extension = kwargs.get("file_extension", "")
if extension.lower() != ".pdf":
return None
if pdf_engine == "pdfminer":
text_content = pdfminer.high_level.extract_text(local_path)
elif pdf_engine == "pymupdf4llm":
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False)
if engine is not None and engine not in self._engines:
raise FileConversionException(
"'pdf_engine' not valid. Please choose between {}.".format(
list(self._engines.keys())
)
)
else:
# return None # unknown method
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].")
text_content = self._engines[engine](local_path, **engine_kwargs)
return DocumentConverterResult(title=None, text_content=text_content)


Expand Down
29 changes: 23 additions & 6 deletions tests/test_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@
import requests

from warnings import catch_warnings, resetwarnings
import sys
sys.path.insert(0, "/home/yxl/Projects/markitdown/src")
from markitdown import MarkItDown

skip_remote = (
Expand Down Expand Up @@ -302,16 +300,35 @@ def test_markitdown_llm() -> None:

def test_markitdown_pdf() -> None:
markitdown = MarkItDown()

# Tested with a local PDF; using PDF_TEST_URL should also work.

# By pymupdf4llm
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm")
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={"show_progress": False}, # additional kwargs
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content

# By pymupdf4llm and extract images
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"),
engine="pymupdf4llm",
engine_kwargs={
"show_progress": False,
"write_images": True,
"image_path": "tests/pics",
}, # `write_images` must be True; `image_path` sets the directory where images are saved.
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content

# By pdfminer
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer")
result = markitdown.convert(
os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), engine="pdfminer"
)
for test_string in PDF_TEST_STRINGS:
assert test_string in result.text_content

Expand Down

0 comments on commit 565ef05

Please sign in to comment.