Skip to content

Commit

Permalink
add html escape in md export and fix formula escapes
Browse files Browse the repository at this point in the history
Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm committed Jan 31, 2025
1 parent 0519d50 commit 666ca02
Show file tree
Hide file tree
Showing 3 changed files with 63 additions and 53 deletions.
108 changes: 59 additions & 49 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import base64
import copy
import hashlib
import html
import json
import mimetypes
import os
Expand Down Expand Up @@ -2082,6 +2083,46 @@ def export_to_markdown( # noqa: C901
previous_level = 0 # Track the previous item's level
in_list = False # Track if we're currently processing list items

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def _escape_underscores(text):
"""Escape underscores but leave them intact in the URL.."""
# Firstly, identify all the URL patterns.
url_pattern = r"!\[.*?\]\((.*?)\)"
# Matches both inline ($...$) and block ($$...$$) LaTeX equations:
latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
combined_pattern = f"({url_pattern})|({latex_pattern})"

parts = []
last_end = 0

for match in re.finditer(combined_pattern, text):
# Text to add before the URL (needs to be escaped)
before_url = text[last_end : match.start()]
parts.append(re.sub(r"(?<!\\)_", r"\_", before_url))

# Add the full URL part (do not escape)
parts.append(match.group(0))
last_end = match.end()

# Add the final part of the text (which needs to be escaped)
if last_end < len(text):
parts.append(re.sub(r"(?<!\\)_", r"\_", text[last_end:]))

return "".join(parts)

def _append_text(text: str, do_escape_html=True, do_escape_underscores=True):
    """Apply the configured escapings to *text* and buffer it in ``mdtexts``.

    Underscore escaping runs first (and only when the export-level
    ``escaping_underscores`` flag is set); HTML escaping runs second.
    """
    processed = text
    if do_escape_underscores and escaping_underscores:
        # Protect literal underscores; image URLs and LaTeX spans are kept.
        processed = _escape_underscores(processed)
    if do_escape_html:
        # quote=False: escape only &, < and >, leaving quote characters as-is.
        processed = html.escape(processed, quote=False)
    mdtexts.append(processed)

for ix, (item, level) in enumerate(
self.iterate_items(self.body, with_groups=True, page_no=page_no)
):
Expand Down Expand Up @@ -2130,7 +2171,7 @@ def export_to_markdown( # noqa: C901
in_list = False
marker = "" if strict_text else "#"
text = f"{marker} {item.text}"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif (
isinstance(item, TextItem)
Expand All @@ -2143,12 +2184,12 @@ def export_to_markdown( # noqa: C901
if len(marker) < 2:
marker = "##"
text = f"{marker} {item.text}\n"
mdtexts.append(text.strip() + "\n")
_append_text(text.strip() + "\n")

elif isinstance(item, CodeItem) and item.label in labels:
in_list = False
text = f"```\n{item.text}\n```\n"
mdtexts.append(text)
_append_text(text, do_escape_underscores=False, do_escape_html=False)

elif isinstance(item, ListItem) and item.label in [DocItemLabel.LIST_ITEM]:
in_list = True
Expand All @@ -2165,85 +2206,54 @@ def export_to_markdown( # noqa: C901
marker = "-" # Markdown needs only dash as item marker.

text = f"{list_indent}{marker} {item.text}"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:
in_list = False
mdtexts.append(f"$${item.text}$$\n")
_append_text(
f"$${item.text}$$\n",
do_escape_underscores=False,
do_escape_html=False,
)

elif isinstance(item, TextItem) and item.label in labels:
in_list = False
if len(item.text) and text_width > 0:
text = item.text
wrapped_text = textwrap.fill(text, width=text_width)
mdtexts.append(wrapped_text + "\n")
_append_text(wrapped_text + "\n")
elif len(item.text):
text = f"{item.text}\n"
mdtexts.append(text)
_append_text(text)

elif isinstance(item, TableItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))
md_table = item.export_to_markdown()
mdtexts.append("\n" + md_table + "\n")
_append_text("\n" + md_table + "\n")

elif isinstance(item, PictureItem) and not strict_text:
in_list = False
mdtexts.append(item.caption_text(self))
_append_text(item.caption_text(self))

line = item.export_to_markdown(
doc=self,
image_placeholder=image_placeholder,
image_mode=image_mode,
)

mdtexts.append(line)
_append_text(line, do_escape_html=False, do_escape_underscores=False)

elif isinstance(item, DocItem) and item.label in labels:
in_list = False
text = "<missing-text>"
mdtexts.append(text)
text = "<!-- missing-text -->"
_append_text(text, do_escape_html=False, do_escape_underscores=False)

mdtext = (delim.join(mdtexts)).strip()
mdtext = re.sub(
r"\n\n\n+", "\n\n", mdtext
) # remove cases of double or more empty lines.

# Our export markdown doesn't contain any emphasis styling:
# Bold, Italic, or Bold-Italic
# Hence, any underscore that we print into Markdown is coming from document text
# That means we need to escape it, to properly reflect content in the markdown
# However, we need to preserve underscores in image URLs
# to maintain their validity
# For example: ![image](path/to_image.png) should remain unchanged
def escape_underscores(text):
    """Escape bare underscores, preserving them inside image URLs and LaTeX.

    Exported markdown carries no emphasis styling, so underscores in the
    output originate from document text and must be escaped — except within
    image links (e.g. ``![image](path/to_image.png)``) and inline/block
    LaTeX equations, which must stay valid and are copied through verbatim.
    """
    # Image-link pattern: the URL portion may contain underscores.
    url_pattern = r"!\[.*?\]\((.*?)\)"
    # Matches both inline ($...$) and block ($$...$$) LaTeX equations:
    latex_pattern = r"\$\$?(?:\\.|[^$\\])*\$\$?"
    keep = re.compile(f"({url_pattern})|({latex_pattern})")

    def _escape(chunk):
        # Escape only underscores that are not already backslash-escaped.
        return re.sub(r"(?<!\\)_", r"\_", chunk)

    pieces = []
    pos = 0
    for hit in keep.finditer(text):
        # Text preceding the protected region is escaped...
        pieces.append(_escape(text[pos : hit.start()]))
        # ...the protected region itself passes through unchanged.
        pieces.append(hit.group(0))
        pos = hit.end()
    # Tail after the last match (appending "" when nothing remains is a no-op).
    pieces.append(_escape(text[pos:]))
    return "".join(pieces)

if escaping_underscores:
mdtext = escape_underscores(mdtext)

return mdtext

def export_to_text( # noqa: C901
Expand Down
6 changes: 3 additions & 3 deletions test/data/doc/2206.01062.yaml.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ In this paper, we present the DocLayNet dataset. It provides pageby-page layout

This enables experimentation with annotation uncertainty and quality control analysis.

- (5) Pre-defined Train-, Test- & Validation-set : Like DocBank, we provide fixed train-, test- & validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.
- (5) Pre-defined Train-, Test- &amp; Validation-set : Like DocBank, we provide fixed train-, test- &amp; validation-sets to ensure proportional representation of the class-labels. Further, we prevent leakage of unique layouts across sets, which has a large effect on model accuracy scores.

All aspects outlined above are detailed in Section 3. In Section 4, we will elaborate on how we designed and executed this large-scale human annotation campaign. We will also share key insights and lessons learned that might prove helpful for other parties planning to set up annotation campaigns.

Expand All @@ -71,7 +71,7 @@ Figure 2: Distribution of DocLayNet pages across document categories.

<!-- image -->

The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws & Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.
The pages in DocLayNet can be grouped into six distinct categories, namely Financial Reports , Manuals , Scientific Articles , Laws &amp; Regulations , Patents and Government Tenders . Each document category was sourced from various repositories. For example, Financial Reports contain both free-style format annual reports 2 which expose company-specific, artistic layouts as well as the more formal SEC filings. The two largest categories ( Financial Reports and Manuals ) contain a large amount of free-style layouts in order to obtain maximum variability. In the other four categories, we boosted the variability by mixing documents from independent providers, such as different government websites or publishers. In Figure 2, we show the document categories contained in DocLayNet with their respective sizes.

We did not control the document selection with regard to language. The vast majority of documents contained in DocLayNet (close to 95%) are published in English language. However, DocLayNet also contains a number of documents in other languages such as German (2.5%), French (1.0%) and Japanese (1.0%). While the document language has negligible impact on the performance of computer vision methods such as object detection and segmentation models, it might prove challenging for layout analysis methods which exploit textual features.

Expand Down Expand Up @@ -225,7 +225,7 @@ Many documents in DocLayNet have a unique styling. In order to avoid overfitting

Throughout this paper, we claim that DocLayNet's wider variety of document layouts leads to more robust layout detection models. In Table 5, we provide evidence for that. We trained models on each of the available datasets (PubLayNet, DocBank and DocLayNet) and evaluated them on the test sets of the other datasets. Due to the different label sets and annotation styles, a direct comparison is not possible. Hence, we focussed on the common labels among the datasets. Between PubLayNet and DocLayNet, these are Picture ,

Table 5: Prediction Performance (mAP@0.5) of a Mask R-CNN R50 network across the PubLayNet, DocBank & DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.
Table 5: Prediction Performance (mAP@0.5) of a Mask R-CNN R50 network across the PubLayNet, DocBank &amp; DocLayNet data-sets. By evaluating on common label classes of each dataset, we observe that the DocLayNet-trained model has much less pronounced variations in performance across all datasets.

| | Testing on | Testing on | Testing on |
|------------|--------------|--------------|--------------|
Expand Down
2 changes: 1 addition & 1 deletion test/data/doc/constructed_doc.embedded.md.gt
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ This is the caption of table 1.

This is the caption of figure 1.

<!-- 🖼️❌ Image not available. Please use `PdfPipelineOptions(generate\_picture\_images=True)` -->
<!-- 🖼️❌ Image not available. Please use `PdfPipelineOptions(generate_picture_images=True)` -->

This is the caption of figure 2.

Expand Down

0 comments on commit 666ca02

Please sign in to comment.