fix: add html escape in md export and fix formula escapes (#143)

* add html escape in md export and fix formula escapes Signed-off-by: Michele Dolfi <[email protected]> * escape also html export Signed-off-by: Michele Dolfi <[email protected]> --------- Signed-off-by: Michele Dolfi <[email protected]>
DS4SD · Jan 31, 2025 · c6590e8 · c6590e8
1 parent 0519d50
commit c6590e8
Show file tree

Hide file tree

Showing 4 changed files with 90 additions and 66 deletions.
diff --git a/docling_core/types/doc/document.py b/docling_core/types/doc/document.py
@@ -3,6 +3,7 @@
 import base64
 import copy
 import hashlib
+import html
 import json
 import mimetypes
 import os
@@ -1045,7 +1046,7 @@ def export_to_html(
 
         text = ""
         if doc is not None and add_caption and len(self.captions):
-            text = self.caption_text(doc)
+            text = html.escape(self.caption_text(doc))
 
         if len(self.data.table_cells) == 0:
             return ""
@@ -1071,7 +1072,7 @@ def export_to_html(
                 if colstart != j:
                     continue
 
-                content = cell.text.strip()
+                content = html.escape(cell.text.strip())
                 celltag = "td"
                 if cell.column_header:
                     celltag = "th"
@@ -2082,6 +2083,46 @@ def export_to_markdown(  # noqa: C901
         previous_level = 0  # Track the previous item's level
         in_list = False  # Track if we're currently processing list items
 
+        # Our export markdown doesn't contain any emphasis styling:
+        # Bold, Italic, or Bold-Italic
+        # Hence, any underscore that we print into Markdown is coming from document text
+        # That means we need to escape it, to properly reflect content in the markdown
+        # However, we need to preserve underscores in image URLs
+        # to maintain their validity
+        # For example: ![image](path/to_image.png) should remain unchanged
+        def _escape_underscores(text):
+            """Escape underscores but leave them intact in the URL.."""
+            # Firstly, identify all the URL patterns.
+            url_pattern = r"!\[.*?\]\((.*?)\)"
+            # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
+            latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
+            combined_pattern = f"({url_pattern})|({latex_pattern})"
+
+            parts = []
+            last_end = 0
+
+            for match in re.finditer(combined_pattern, text):
+                # Text to add before the URL (needs to be escaped)
+                before_url = text[last_end : match.start()]
+                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
+
+                # Add the full URL part (do not escape)
+                parts.append(match.group(0))
+                last_end = match.end()
+
+            # Add the final part of the text (which needs to be escaped)
+            if last_end < len(text):
+                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
+
+            return "".join(parts)
+
+        def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
+            if do_escape_underscores and escaping_underscores:
+                text = _escape_underscores(text)
+            if do_escape_html:
+                text = html.escape(text, quote=False)
+            mdtexts.append(text)
+
         for ix, (item, level) in enumerate(
             self.iterate_items(self.body, with_groups=True, page_no=page_no)
         ):
@@ -2130,7 +2171,7 @@ def export_to_markdown(  # noqa: C901
                 in_list = False
                 marker = "" if strict_text else "#"
                 text = f"{marker} {item.text}"
-                mdtexts.append(text.strip() + "\n")
+                _append_text(text.strip() + "\n")
 
             elif (
                 isinstance(item, TextItem)
@@ -2143,12 +2184,12 @@ def export_to_markdown(  # noqa: C901
                     if len(marker) < 2:
                         marker = "##"
                 text = f"{marker} {item.text}\n"
-                mdtexts.append(text.strip() + "\n")
+                _append_text(text.strip() + "\n")
 
             elif isinstance(item, CodeItem) and item.label in labels:
                 in_list = False
                 text = f"```\n{item.text}\n```\n"
-                mdtexts.append(text)
+                _append_text(text, do_escape_underscores=False, do_escape_html=False)
 
             elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
                 in_list = True
@@ -2165,85 +2206,54 @@ def export_to_markdown(  # noqa: C901
                     marker = "-"  # Markdown needs only dash as item marker.
 
                 text = f"{list_indent}{marker} {item.text}"
-                mdtexts.append(text)
+                _append_text(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
                 in_list = False
-                mdtexts.append(f"$${item.text}$$\n")
+                _append_text(
+                    f"$${item.text}$$\n",
+                    do_escape_underscores=False,
+                    do_escape_html=False,
+                )
 
             elif isinstance(item, TextItem) and item.label in labels:
                 in_list = False
                 if len(item.text) and text_width > 0:
+                    text = item.text
                     wrapped_text = textwrap.fill(text, width=text_width)
-                    mdtexts.append(wrapped_text + "\n")
+                    _append_text(wrapped_text + "\n")
                 elif len(item.text):
                     text = f"{item.text}\n"
-                    mdtexts.append(text)
+                    _append_text(text)
 
             elif isinstance(item, TableItem) and not strict_text:
                 in_list = False
-                mdtexts.append(item.caption_text(self))
+                _append_text(item.caption_text(self))
                 md_table = item.export_to_markdown()
-                mdtexts.append("\n" + md_table + "\n")
+                _append_text("\n" + md_table + "\n")
 
             elif isinstance(item, PictureItem) and not strict_text:
                 in_list = False
-                mdtexts.append(item.caption_text(self))
+                _append_text(item.caption_text(self))
 
                 line = item.export_to_markdown(
                     doc=self,
                     image_placeholder=image_placeholder,
                     image_mode=image_mode,
                 )
 
-                mdtexts.append(line)
+                _append_text(line, do_escape_html=False, do_escape_underscores=False)
 
             elif isinstance(item, DocItem) and item.label in labels:
                 in_list = False
-                text = "<missing-text>"
-                mdtexts.append(text)
+                text = "<!-- missing-text -->"
+                _append_text(text, do_escape_html=False, do_escape_underscores=False)
 
         mdtext = (delim.join(mdtexts)).strip()
         mdtext = re.sub(
             r"\n\n\n+", "\n\n", mdtext
         )  # remove cases of double or more empty lines.
 
-        # Our export markdown doesn't contain any emphasis styling:
-        # Bold, Italic, or Bold-Italic
-        # Hence, any underscore that we print into Markdown is coming from document text
-        # That means we need to escape it, to properly reflect content in the markdown
-        # However, we need to preserve underscores in image URLs
-        # to maintain their validity
-        # For example: ![image](path/to_image.png) should remain unchanged
-        def escape_underscores(text):
-            """Escape underscores but leave them intact in the URL.."""
-            # Firstly, identify all the URL patterns.
-            url_pattern = r"!\[.*?\]\((.*?)\)"
-            # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
-            latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
-            combined_pattern = f"({url_pattern})|({latex_pattern})"
-
-            parts = []
-            last_end = 0
-
-            for match in re.finditer(combined_pattern, text):
-                # Text to add before the URL (needs to be escaped)
-                before_url = text[last_end : match.start()]
-                parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))
-
-                # Add the full URL part (do not escape)
-                parts.append(match.group(0))
-                last_end = match.end()
-
-            # Add the final part of the text (which needs to be escaped)
-            if last_end < len(text):
-                parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))
-
-            return "".join(parts)
-
-        if escaping_underscores:
-            mdtext = escape_underscores(mdtext)
-
         return mdtext
 
     def export_to_text(  # noqa: C901
@@ -2371,6 +2381,11 @@ def close_lists(
 
         in_ordered_list: List[bool] = []  # False
 
+        def _sanitize_text(text: str, do_escape_html=True) -> str:
+            if do_escape_html:
+                text = html.escape(text, quote=False)
+            return text
+
         for ix, (item, curr_level) in enumerate(
             self.iterate_items(self.body, with_groups=True, page_no=page_no)
         ):
@@ -2421,14 +2436,17 @@ def close_lists(
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:
 
-                text = f"<h1>{item.text}</h1>"
+                text = f"<h1>{_sanitize_text(item.text)}</h1>"
                 html_texts.append(text.strip())
 
             elif isinstance(item, SectionHeaderItem):
 
                 section_level: int = item.level + 1
 
-                text = f"<h{(section_level)}>{item.text}</h{(section_level)}>"
+                text = (
+                    f"<h{(section_level)}>"
+                    f"{_sanitize_text(item.text)}</h{(section_level)}>"
+                )
                 html_texts.append(text.strip())
 
             elif isinstance(item, TextItem) and item.label in [
@@ -2443,31 +2461,37 @@ def close_lists(
                 if section_level >= 6:
                     section_level = 6
 
-                text = f"<h{section_level}>{item.text}</h{section_level}>"
+                text = (
+                    f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
+                )
                 html_texts.append(text.strip())
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
 
-                text = f"<pre>{item.text}</pre>"
+                text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
                 html_texts.append(text)
 
             elif isinstance(item, ListItem):
 
-                text = f"<li>{item.text}</li>"
+                text = f"<li>{_sanitize_text(item.text)}</li>"
                 html_texts.append(text)
 
             elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:
 
-                text = f"<li>{item.text}</li>"
+                text = f"<li>{_sanitize_text(item.text)}</li>"
                 html_texts.append(text)
 
             elif isinstance(item, CodeItem) and item.label in labels:
-                text = f"<pre><code>{item.text}</code></pre>"
+                text = (
+                    "<pre><code>"
+                    f"{_sanitize_text(item.text, do_escape_html=False)}"
+                    "</code></pre>"
+                )
                 html_texts.append(text.strip())
 
             elif isinstance(item, TextItem) and item.label in labels:
 
-                text = f"<p>{item.text}</p>"
+                text = f"<p>{_sanitize_text(item.text)}</p>"
                 html_texts.append(text.strip())
             elif isinstance(item, TableItem):