Skip to content

Commit

Permalink
feat(HTML): Export formulas with mathml (#144)
Browse files Browse the repository at this point in the history
* remove unneeded logic

the labels allowlist is already checked earlier, so the per-branch `item.label in labels` checks are redundant

Signed-off-by: Michele Dolfi <[email protected]>

* TextItem cannot have the CODE label

Signed-off-by: Michele Dolfi <[email protected]>

* display formulas with mathml in exported html

Signed-off-by: Michele Dolfi <[email protected]>

* expose argument in save_as_html

Signed-off-by: Michele Dolfi <[email protected]>

* rename `_sanitize_text` to `_prepare_tag_content` and add optional `\n` replacement

Signed-off-by: Michele Dolfi <[email protected]>

* fix mypy parsing

Signed-off-by: Michele Dolfi <[email protected]>

* remove unused/impossible elif

Signed-off-by: Michele Dolfi <[email protected]>

* remove strip()

Signed-off-by: Michele Dolfi <[email protected]>

* add `display: none` for the LaTeX annotation

Signed-off-by: Michele Dolfi <[email protected]>

* make mathml the default

Signed-off-by: Michele Dolfi <[email protected]>

* revert wrong commit

Signed-off-by: Michele Dolfi <[email protected]>

---------

Signed-off-by: Michele Dolfi <[email protected]>
  • Loading branch information
dolfim-ibm authored Jan 31, 2025
1 parent eb96e31 commit ed36437
Show file tree
Hide file tree
Showing 12 changed files with 102 additions and 39 deletions.
87 changes: 49 additions & 38 deletions docling_core/types/doc/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@
from pathlib import Path
from typing import Any, Dict, Final, List, Literal, Optional, Tuple, Union
from urllib.parse import quote, unquote
from xml.etree.cElementTree import SubElement, tostring
from xml.sax.saxutils import unescape

import latex2mathml.converter
import pandas as pd
import yaml
from PIL import Image as PILImage
Expand Down Expand Up @@ -1387,6 +1390,9 @@ class DoclingDocument(BaseModel):
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>"""

Expand Down Expand Up @@ -2282,6 +2288,7 @@ def save_as_html(
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = True,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand All @@ -2301,6 +2308,7 @@ def save_as_html(
to_element=to_element,
labels=labels,
image_mode=image_mode,
formula_to_mathml=formula_to_mathml,
page_no=page_no,
html_lang=html_lang,
html_head=html_head,
Expand Down Expand Up @@ -2347,6 +2355,7 @@ def export_to_html( # noqa: C901
to_element: int = sys.maxsize,
labels: set[DocItemLabel] = DEFAULT_EXPORT_LABELS,
image_mode: ImageRefMode = ImageRefMode.PLACEHOLDER,
formula_to_mathml: bool = True,
page_no: Optional[int] = None,
html_lang: str = "en",
html_head: str = _HTML_DEFAULT_HEAD,
Expand Down Expand Up @@ -2381,9 +2390,13 @@ def close_lists(

in_ordered_list: List[bool] = [] # False

def _sanitize_text(text: str, do_escape_html=True) -> str:
def _prepare_tag_content(
text: str, do_escape_html=True, do_replace_newline=True
) -> str:
if do_escape_html:
text = html.escape(text, quote=False)
if do_replace_newline:
text = text.replace("\n", "<br>")
return text

for ix, (item, curr_level) in enumerate(
Expand Down Expand Up @@ -2416,7 +2429,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ol>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(True)
Expand All @@ -2426,7 +2439,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:
]:

text = "<ul>"
html_texts.append(text.strip())
html_texts.append(text)

# Increment list nesting level when entering a new list
in_ordered_list.append(False)
Expand All @@ -2436,63 +2449,62 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

elif isinstance(item, TextItem) and item.label in [DocItemLabel.TITLE]:

text = f"<h1>{_sanitize_text(item.text)}</h1>"
html_texts.append(text.strip())
text = f"<h1>{_prepare_tag_content(item.text)}</h1>"
html_texts.append(text)

elif isinstance(item, SectionHeaderItem):

section_level: int = item.level + 1
section_level: int = min(item.level + 1, 6)

text = (
f"<h{(section_level)}>"
f"{_sanitize_text(item.text)}</h{(section_level)}>"
f"{_prepare_tag_content(item.text)}</h{(section_level)}>"
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [
DocItemLabel.SECTION_HEADER
]:

section_level = curr_level

if section_level <= 1:
section_level = 2
html_texts.append(text)

if section_level >= 6:
section_level = 6
elif isinstance(item, TextItem) and item.label in [DocItemLabel.FORMULA]:

text = (
f"<h{section_level}>{_sanitize_text(item.text)}</h{section_level}>"
math_formula = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
html_texts.append(text.strip())

elif isinstance(item, TextItem) and item.label in [DocItemLabel.CODE]:
if formula_to_mathml:
# Building a math equation in MathML format
# ref https://www.w3.org/TR/wai-aria-1.1/#math
mathml_element = latex2mathml.converter.convert_to_element(
math_formula, display="block"
)
annotation = SubElement(
mathml_element, "annotation", dict(encoding="TeX")
)
annotation.text = math_formula
mathml = unescape(tostring(mathml_element, encoding="unicode"))
text = f"<div>{mathml}</div>"

text = f"<pre>{_sanitize_text(item.text, do_escape_html=False)}</pre>"
else:
text = f"<pre>{math_formula}</pre>"
html_texts.append(text)

elif isinstance(item, ListItem):

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_tag_content(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in [DocItemLabel.LIST_ITEM]:

text = f"<li>{_sanitize_text(item.text)}</li>"
text = f"<li>{_prepare_tag_content(item.text)}</li>"
html_texts.append(text)

elif isinstance(item, CodeItem) and item.label in labels:
text = (
"<pre><code>"
f"{_sanitize_text(item.text, do_escape_html=False)}"
"</code></pre>"
elif isinstance(item, CodeItem):
code_text = _prepare_tag_content(
item.text, do_escape_html=False, do_replace_newline=False
)
html_texts.append(text.strip())
text = f"<pre><code>{code_text}</code></pre>"
html_texts.append(text)

elif isinstance(item, TextItem) and item.label in labels:
elif isinstance(item, TextItem):

text = f"<p>{_sanitize_text(item.text)}</p>"
html_texts.append(text.strip())
text = f"<p>{_prepare_tag_content(item.text)}</p>"
html_texts.append(text)
elif isinstance(item, TableItem):

text = item.export_to_html(doc=self, add_caption=True)
Expand All @@ -2513,8 +2525,7 @@ def _sanitize_text(text: str, do_escape_html=True) -> str:

lines = []
lines.extend(head_lines)
for i, line in enumerate(html_texts):
lines.append(line.replace("\n", "<br>"))
lines.extend(html_texts)

delim = "\n"
html_text = (delim.join(lines)).strip()
Expand Down
13 changes: 12 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ typing-extensions = "^4.12.2"
transformers = { version = "^4.34.0", optional = true }
semchunk = { version = "^2.2.0", optional = true }
typer = "^0.12.5"
latex2mathml = "^3.77.0"

[tool.poetry.extras]
chunking = ["transformers", "semchunk"]
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/2206.01062.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h2>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h2>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/bad_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>This is the title</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.embedded.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.placeholder.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_doc.referenced.html.gt
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/constructed_document.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>Title of the Document</h1>
Expand Down
3 changes: 3 additions & 0 deletions test/data/doc/dummy_doc.yaml.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,9 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
</style>
</head>
<h1>DocLayNet: A Large Human-Annotated Dataset for Document-Layout Analysis</h1>
Expand Down
5 changes: 5 additions & 0 deletions test/data/docling_document/export/formula_mathml.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
<!DOCTYPE html>
<html lang="en">

<div><math xmlns="http://www.w3.org/1998/Math/MathML" display="block"><mrow><mfrac><mrow><mn>1</mn></mrow><mrow><mi>x</mi></mrow></mfrac></mrow><annotation encoding="TeX">\frac{1}{x}</annotation></math></div>
</html>
14 changes: 14 additions & 0 deletions test/test_docling_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -661,6 +661,20 @@ def test_version_doc():
assert doc.version == CURRENT_VERSION


def test_formula_mathml():
    """Export a single FORMULA item to HTML and compare with the stored file.

    The document holds only the LaTeX string ``\\frac{1}{x}``; it is exported
    with ``formula_to_mathml=True`` and an empty ``html_head``, and the result
    must equal the contents of ``formula_mathml.html`` exactly.
    """
    gt_path = Path("test/data/docling_document/export/formula_mathml.html")
    latex = "\\frac{1}{x}"

    document = DoclingDocument(name="Dummy")
    document.add_text(label=DocItemLabel.FORMULA, text=latex)
    exported = document.export_to_html(formula_to_mathml=True, html_head="")

    # Read the reference file only after the export, as in the original.
    expected = gt_path.read_text(encoding="utf8")

    assert exported == expected


def test_docitem_get_image():
# Prepare the document
doc = DoclingDocument(name="Dummy")
Expand Down

0 comments on commit ed36437

Please sign in to comment.