Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add options to keep data uris #272

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 44 additions & 15 deletions src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ class _CustomMarkdownify(markdownify.MarkdownConverter):

def __init__(self, **options: Any):
    """Set up the Markdown converter with project defaults.

    Headings default to ATX style (``#``-prefixed) unless the caller
    overrides ``heading_style``.  The custom ``keep_data_uris`` flag is
    consumed here — it is an option of this subclass, not of the
    underlying markdownify converter — and everything else is forwarded
    unchanged to ``markdownify.MarkdownConverter``.
    """
    options.setdefault("heading_style", markdownify.ATX)
    # Pull out our own option before delegating; the base converter
    # does not recognize it.
    self.keep_data_uris = options.pop("keep_data_uris", False)
    super().__init__(**options)

Expand Down Expand Up @@ -133,10 +134,10 @@ def convert_img(self, el: Any, text: str, convert_as_inline: bool) -> str:
return alt

# Remove dataURIs
if src.startswith("data:"):
if not self.keep_data_uris and src.startswith("data:"):
src = src.split(",")[0] + "..."

return "![%s](%s%s)" % (alt, src, title_part)
return "![%s%s](%s)" % (alt, title_part, src)

def convert_soup(self, soup: Any) -> str:
    """Render a parsed BeautifulSoup tree to Markdown via the base converter."""
    markdown_text = super().convert_soup(soup)  # type: ignore
    return markdown_text
Expand Down Expand Up @@ -189,6 +190,10 @@ def convert(
class HtmlConverter(DocumentConverter):
"""Anything with content type text/html"""

def __init__(self, keep_data_uris: bool = False):
    """Create the HTML-to-Markdown converter.

    Args:
        keep_data_uris: When True, ``data:`` URIs (e.g. inline base64
            images) are preserved in the Markdown output instead of
            being truncated.
    """
    # Annotation tightened from Optional[bool]: the default is False and
    # None is never a meaningful value for this flag.
    self.keep_data_uris = keep_data_uris
    super().__init__()

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -217,9 +222,13 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
body_elm = soup.find("body")
webpage_text = ""
if body_elm:
webpage_text = _CustomMarkdownify().convert_soup(body_elm)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)

assert isinstance(webpage_text, str)

Expand All @@ -232,6 +241,10 @@ def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
class RSSConverter(DocumentConverter):
"""Convert RSS / Atom type to markdown"""

def __init__(self, keep_data_uris: bool = False):
    """Create the RSS/Atom feed converter.

    Args:
        keep_data_uris: When True, ``data:`` URIs in feed content are
            preserved in the Markdown output instead of being truncated.
    """
    # Annotation tightened from Optional[bool]: the default is False and
    # None is never a meaningful value for this flag.
    self.keep_data_uris = keep_data_uris
    super().__init__()

def convert(
self, local_path: str, **kwargs
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -347,7 +360,9 @@ def _parse_content(self, content: str) -> str:
try:
# using bs4 because many RSS feeds have HTML-styled content
soup = BeautifulSoup(content, "html.parser")
return _CustomMarkdownify().convert_soup(soup)
return _CustomMarkdownify(keep_data_uris=self.keep_data_uris).convert_soup(
soup
)
except BaseException as _:
return content

Expand All @@ -369,6 +384,10 @@ def _get_data_by_tag_name(
class WikipediaConverter(DocumentConverter):
"""Handle Wikipedia pages separately, focusing only on the main document content."""

def __init__(self, keep_data_uris: bool = False):
    """Create the Wikipedia page converter.

    Args:
        keep_data_uris: When True, ``data:`` URIs in page images are
            preserved in the Markdown output instead of being truncated.
    """
    # Annotation tightened from Optional[bool]: the default is False and
    # None is never a meaningful value for this flag.
    self.keep_data_uris = keep_data_uris
    super().__init__()

def convert(
self, local_path: str, **kwargs: Any
) -> Union[None, DocumentConverterResult]:
Expand Down Expand Up @@ -403,11 +422,13 @@ def convert(
assert isinstance(main_title, str)

# Convert the page
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify().convert_soup(
body_elm
)
webpage_text = f"# {main_title}\n\n" + _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(body_elm)
else:
webpage_text = _CustomMarkdownify().convert_soup(soup)
webpage_text = _CustomMarkdownify(
keep_data_uris=self.keep_data_uris
).convert_soup(soup)

return DocumentConverterResult(
title=main_title,
Expand Down Expand Up @@ -609,6 +630,10 @@ def _convert(self, notebook_content: dict) -> Union[None, DocumentConverterResul


class BingSerpConverter(DocumentConverter):
def __init__(self, keep_data_uris: bool = False):
    """Create the Bing SERP converter.

    Args:
        keep_data_uris: When True, ``data:`` URIs in result content are
            preserved in the Markdown output instead of being truncated.
    """
    # NOTE(review): the triple-quoted class description currently appears
    # *after* this method in the diff; it should be moved above __init__
    # so it remains the class docstring.
    # Annotation tightened from Optional[bool]: the default is False and
    # None is never a meaningful value for this flag.
    self.keep_data_uris = keep_data_uris
    super().__init__()

"""
Handle Bing results pages (only the organic search results).
NOTE: It is better to use the Bing API
Expand Down Expand Up @@ -640,7 +665,7 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
slug.extract()

# Parse the algorithmic results
_markdownify = _CustomMarkdownify()
_markdownify = _CustomMarkdownify(keep_data_uris=self.keep_data_uris)
results = list()
for result in soup.find_all(class_="b_algo"):
# Rewrite redirect urls
Expand Down Expand Up @@ -701,6 +726,9 @@ class DocxConverter(HtmlConverter):
Converts DOCX files to Markdown. Style information (e.g., headings) and tables are preserved where possible.
"""

def __init__(self, keep_data_uris: bool = False):
    """Create the DOCX converter.

    Exists only to expose ``keep_data_uris`` explicitly in the signature;
    it forwards directly to ``HtmlConverter.__init__``.

    Args:
        keep_data_uris: When True, ``data:`` URIs in embedded images are
            preserved in the Markdown output instead of being truncated.
    """
    # Annotation tightened from Optional[bool]: the default is False and
    # None is never a meaningful value for this flag.
    super().__init__(keep_data_uris=keep_data_uris)

def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
# Bail if not a DOCX
extension = kwargs.get("file_extension", "")
Expand Down Expand Up @@ -1337,6 +1365,7 @@ def __init__(
llm_model: Optional[str] = None,
style_map: Optional[str] = None,
exiftool_path: Optional[str] = None,
keep_data_uris: Optional[bool] = False,
# Deprecated
mlm_client: Optional[Any] = None,
mlm_model: Optional[str] = None,
Expand Down Expand Up @@ -1389,12 +1418,12 @@ def __init__(
# Later registrations are tried first / take higher priority than earlier registrations
# To this end, the most specific converters should appear below the most generic converters
self.register_page_converter(PlainTextConverter())
self.register_page_converter(HtmlConverter())
self.register_page_converter(RSSConverter())
self.register_page_converter(WikipediaConverter())
self.register_page_converter(HtmlConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(RSSConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(WikipediaConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(YouTubeConverter())
self.register_page_converter(BingSerpConverter())
self.register_page_converter(DocxConverter())
self.register_page_converter(BingSerpConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(DocxConverter(keep_data_uris=keep_data_uris))
self.register_page_converter(XlsxConverter())
self.register_page_converter(XlsConverter())
self.register_page_converter(PptxConverter())
Expand Down