Skip to content

Commit

Permalink
fix: add html escape in md export and fix formula escapes (#143)
Browse files Browse the repository at this point in the history
* add html escape in md export and fix formula escapes

Signed-off-by: Michele Dolfi <[email protected]>

* escape also html export

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Jan 31, 2025
1 parent 0519d50 commit c6590e8
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 66 deletions.
142 changes: 83 additions & 59 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import copy
import hashlib
import html
import json
import mimetypes
import os
Expand Down Expand Up @@ -1045,7 +1046,7 @@ def export_to_html(

text = ""
if doc is not None and add_caption and len(self.captions):
text = self.caption_text(doc)
text = html.escape(self.caption_text(doc))

if len(self.data.table_cells) == 0:
return ""
Expand All @@ -1071,7 +1072,7 @@ def export_to_html(
if colstart != j:
continue

content = cell.text.strip()
content = html.escape(cell.text.strip())
celltag = "td"
if cell.column_header:
celltag = "th"
Expand Down Expand Up @@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
previous_level = 0 # Track the previous item's level
in_list = False # Track if we're currently processing list items

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def _escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
if do_escape_underscores and escaping_underscores:
text = _escape_underscores(text)
if do_escape_html:
text = html.escape(text, quote=False)
mdtexts.append(text)

for ix, (item, level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
in_list = False
marker = "" if strict_text else "#"
text = f"{marker} {item.text}"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif (
isinstance(item, TextItem)
Expand All @@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
if len(marker) < 2:
marker = "##"
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
_append_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
in_list = True
Expand All @@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
marker = "-" # Markdown needs only dash as item marker.

text = f"{list_indent}{marker} {item.text}"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
mdtexts.append(f"$${item.text}$$\n")
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
in_list = False
if len(item.text) and text_width > 0:
text = item.text
wrapped_text = textwrap.fill(text, width=text_width)
mdtexts.append(wrapped_text + "\n")
_append_text(wrapped_text + "\n")
elif len(item.text):
text = f"{item.text}\n"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TableItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))
md_table = item.export_to_markdown()
mdtexts.append("\n" + md_table + "\n")
_append_text("\n" + md_table + "\n")

elif isinstance(item, PictureItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))

line = item.export_to_markdown(
doc=self,
image_placeholder=image_placeholder,
image_mode=image_mode,
)

mdtexts.append(line)
_append_text(line, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem) and item.label in labels:
in_list = False
text = "<missing-text>"
mdtexts.append(text)
text = "<!-- missing-text -->"
_append_text(text, do_escape_html=False, do_escape_underscores=False)

mdtext = (delim.join(mdtexts)).strip()
mdtext = re.sub(
r"\n\n\n+", "\n\n", mdtext
) # remove cases of double or more empty lines.

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

if escaping_underscores:
mdtext = escape_underscores(mdtext)

return mdtext

def export_to_text( # noqa: C901
Expand Down Expand Up @@ -2371,6 +2381,11 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _sanitize_text(text: str, do_escape_html=True) -> str:
if do_escape_html:
text = html.escape(text, quote=False)
return text

for ix, (item, curr_level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2421,14 +2436,17 @@ def close_lists(

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{item.text}</h1>"
text = f"<h1>{_sanitize_text(item.text)}</h1>"
html_texts.append(text.strip())

elif isinstance(item, SectionHeaderItem):

section_level: int = item.level + 1

text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [
Expand All @@ -2443,31 +2461,37 @@ def close_lists(
if section_level >= 6:
section_level = 6

text = f"<h{section_level}>{item.text}</h{section_level}>"
text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:

text = f"<pre>{item.text}</pre>"
text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{item.text}</li>"
text = f"<li>{_sanitize_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{item.text}</li>"
text = f"<li>{_sanitize_text(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = f"<pre><code>{item.text}</code></pre>"
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in labels:

text = f"<p>{item.text}</p>"
text = f"<p>{_sanitize_text(item.text)}</p>"
html_texts.append(text.strip())
elif isinstance(item, TableItem):

Expand Down
Loading

0 comments on commit c6590e8

Please sign in to comment.