Skip to content

Commit

Permalink
add html escape in md export and fix formula escapes
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Jan 31, 2025
1 parent 0519d50 commit 666ca02
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 53 deletions.
108 changes: 59 additions & 49 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import copy
import hashlib
import html
import json
import mimetypes
import os
Expand Down Expand Up @@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
previous_level = 0 # Track the previous item's level
in_list = False # Track if we're currently processing list items

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def _escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
    """Apply the configured escapings to *text* and buffer it in ``mdtexts``.

    Underscore escaping runs first (and only when the export-level
    ``escaping_underscores`` flag is set); HTML escaping runs second.
    """
    processed = text
    if do_escape_underscores and escaping_underscores:
        # Protect literal underscores; image URLs and LaTeX spans are kept.
        processed = _escape_underscores(processed)
    if do_escape_html:
        # quote=False: escape only &, < and >, leaving quote characters as-is.
        processed = html.escape(processed, quote=False)
    mdtexts.append(processed)

for ix, (item, level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
in_list = False
marker = "" if strict_text else "#"
text = f"{marker} {item.text}"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif (
isinstance(item, TextItem)
Expand All @@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
if len(marker) < 2:
marker = "##"
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
_append_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
in_list = True
Expand All @@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
marker = "-" # Markdown needs only dash as item marker.

text = f"{list_indent}{marker} {item.text}"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
mdtexts.append(f"$${item.text}$$\n")
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
in_list = False
if len(item.text) and text_width > 0:
text = item.text
wrapped_text = textwrap.fill(text, width=text_width)
mdtexts.append(wrapped_text + "\n")
_append_text(wrapped_text + "\n")
elif len(item.text):
text = f"{item.text}\n"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TableItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))
md_table = item.export_to_markdown()
mdtexts.append("\n" + md_table + "\n")
_append_text("\n" + md_table + "\n")

elif isinstance(item, PictureItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))

line = item.export_to_markdown(
doc=self,
image_placeholder=image_placeholder,
image_mode=image_mode,
)

mdtexts.append(line)
_append_text(line, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem) and item.label in labels:
in_list = False
text = "<missing-text>"
mdtexts.append(text)
text = "<!-- missing-text -->"
_append_text(text, do_escape_html=False, do_escape_underscores=False)

mdtext = (delim.join(mdtexts)).strip()
mdtext = re.sub(
r"\n\n\n+", "\n\n", mdtext
) # remove cases of double or more empty lines.

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def escape_underscores(text):
    """Escape bare underscores, preserving them inside image URLs and LaTeX.

    Exported markdown carries no emphasis styling, so underscores in the
    output originate from document text and must be escaped — except within
    image links (e.g. ``![image](path/to_image.png)``) and inline/block
    LaTeX equations, which must stay valid and are copied through verbatim.
    """
    # Image-link pattern: the URL portion may contain underscores.
    url_pattern = r"!\[.*?\]\((.*?)\)"
    # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
    latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
    keep = re.compile(f"({url_pattern})|({latex_pattern})")

    def _escape(chunk):
        # Escape only underscores that are not already backslash-escaped.
        return re.sub(r"(?<!\\)_", r"\_", chunk)

    pieces = []
    pos = 0
    for hit in keep.finditer(text):
        # Text preceding the protected region is escaped...
        pieces.append(_escape(text[pos : hit.start()]))
        # ...the protected region itself passes through unchanged.
        pieces.append(hit.group(0))
        pos = hit.end()
    # Tail after the last match (appending "" when nothing remains is a no-op).
    pieces.append(_escape(text[pos:]))
    return "".join(pieces)

if escaping_underscores:
mdtext = escape_underscores(mdtext)

return mdtext

def export_to_text( # noqa: C901
Expand Down
6 changes: 3 additions & 3 deletions test/data/doc/2206.01062.yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ In this paper, we present the DocLayNet dataset. It provides pageby-page layout

This enables experimentation with annotation uncertainty and quality control analysis.

- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
- (5) Pre-defined Train-, Test- &amp; Validation-set : Like DocBank, we provide fixed train-, test- &amp; validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.

All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.

Expand All @@ -71,7 +71,7 @@ Figure 2: Distribution of DocLayNet pages across document categories.

<!-- image -->

The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.
The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws &amp; Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.

We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.

Expand Down Expand Up @@ -225,7 +225,7 @@ Many documents in DocLayNet have a unique styling. In order to avoid overfitting

Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,

Table 5: Prediction Performance (mAP@0.5) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
Table 5: Prediction Performance (mAP@0.5) of a Mask R-CNN R50 network across the PubLayNet, DocBank &amp; DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.

| | Testing on | Testing on | Testing on |
|------------|--------------|--------------|--------------|
Expand Down
2 changes: 1 addition & 1 deletion test/data/doc/constructed_doc.embedded.md.gt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ This is the caption of table 1.

This is the caption of figure 1.

<!-- 🖼️❌ Image not available. Please use `PdfPipelineOptions(generate\_picture\_images=True)` -->
<!-- 🖼️❌ Image not available. Please use `PdfPipelineOptions(generate_picture_images=True)` -->

This is the caption of figure 2.

Expand Down

0 comments on commit 666ca02

Please sign in to comment.