-
Notifications
You must be signed in to change notification settings - Fork 1.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
update: change pdf text parser to pymupdf4llm #139
base: main
Are you sure you want to change the base?
Changes from 7 commits
b3f7e00
df5f14e
263e0b5
d4d11a8
797e0d4
ba5df9b
e808548
565ef05
bd95fb0
8482477
f07ea3e
a1766c5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,7 +14,7 @@ | |
import traceback | ||
import zipfile | ||
from xml.dom import minidom | ||
from typing import Any, Dict, List, Optional, Union | ||
from typing import Any, Dict, List, Optional, Union, Literal | ||
from pathlib import Path | ||
from urllib.parse import parse_qs, quote, unquote, urlparse, urlunparse | ||
from warnings import warn, resetwarnings, catch_warnings | ||
|
@@ -24,6 +24,7 @@ | |
import pandas as pd | ||
import pdfminer | ||
import pdfminer.high_level | ||
import pymupdf4llm | ||
import pptx | ||
|
||
# File-format detection | ||
|
@@ -676,19 +677,27 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: | |
|
||
class PdfConverter(DocumentConverter): | ||
""" | ||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. | ||
Converts PDFs to Markdown. Most style information is ignored, so the results are essentially plain-text. | ||
""" | ||
|
||
def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: | ||
def convert(self, local_path, pdf_engine: Literal['pdfminer', 'pymupdf4llm']='pdfminer', **kwargs) -> Union[None, DocumentConverterResult]: | ||
""" | ||
Example: | ||
>>> source = "https://arxiv.org/pdf/2308.08155v2.pdf" | ||
>>> markitdown.convert(source, pdf_engine="pymupdf4llm") | ||
""" | ||
# Bail if not a PDF | ||
extension = kwargs.get("file_extension", "") | ||
if extension.lower() != ".pdf": | ||
return None | ||
|
||
return DocumentConverterResult( | ||
title=None, | ||
text_content=pdfminer.high_level.extract_text(local_path), | ||
) | ||
if pdf_engine == "pdfminer": | ||
text_content = pdfminer.high_level.extract_text(local_path) | ||
elif pdf_engine == "pymupdf4llm": | ||
text_content = pymupdf4llm.to_markdown(local_path, show_progress=False) | ||
else: | ||
# return None # unknown method | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove this |
||
raise FileConversionException("'pdf_engine' not valid. Please choose between ['pdfminer', 'pymupdf4llm'].") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd suggest checking the engine first. You could use _engines: Mapping[str, Any] = {
"pdfminer": pdfminer,
"pymupdf4llm": pymupdf4llm,
}
###
if engine is not None and engine not in self._engines:
raise There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is a good idea for the ease of adding further engines. I have updated those changes in the latest commit. An example of using |
||
return DocumentConverterResult(title=None, text_content=text_content) | ||
|
||
|
||
class DocxConverter(HtmlConverter): | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,7 +7,8 @@ | |
import requests | ||
|
||
from warnings import catch_warnings, resetwarnings | ||
|
||
import sys | ||
sys.path.insert(0, "/home/yxl/Projects/markitdown/src") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think don't need this. Let me know if you need help setting up the test! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will remove this, it won't be used |
||
from markitdown import MarkItDown | ||
|
||
skip_remote = ( | ||
|
@@ -299,6 +300,20 @@ def test_markitdown_llm() -> None: | |
for test_string in ["red", "circle", "blue", "square"]: | ||
assert test_string in result.text_content.lower() | ||
|
||
def test_markitdown_pdf() -> None: | ||
markitdown = MarkItDown() | ||
|
||
# I test by local pdf, using PDF_TEST_URL may also be fine. | ||
|
||
# By pymupdf4llm | ||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pymupdf4llm") | ||
for test_string in PDF_TEST_STRINGS: | ||
assert test_string in result.text_content | ||
|
||
# By pdfminer | ||
result = markitdown.convert(os.path.join(TEST_FILES_DIR, "2308.08155v2.pdf"), pdf_engine="pdfminer") | ||
for test_string in PDF_TEST_STRINGS: | ||
assert test_string in result.text_content | ||
|
||
if __name__ == "__main__": | ||
"""Runs this file's tests from the command line.""" | ||
|
@@ -307,3 +322,4 @@ def test_markitdown_llm() -> None: | |
test_markitdown_exiftool() | ||
test_markitdown_deprecation() | ||
test_markitdown_llm() | ||
test_markitdown_pdf() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think
pdf_engine
should be a named parameter in the function instead of being accessed viakwargs
. This provides more clarity and ensures better handling of default valuesThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for you suggestion. I have added it to a named parameters ( I was meant to align with other converters' parameter definition), and added the exception case when
pdf_engine
is not valid. The new test cases could be seen on my new commit.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is optional, but adding one more named parameter could make this more customizable, like: