Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(markdown): add support for HTML content #855

Merged
merged 2 commits into from
Feb 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 49 additions & 5 deletions docling/backend/md_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,16 @@
from marko import Markdown

from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.backend.html_backend import HTMLDocumentBackend
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument

_log = logging.getLogger(__name__)

_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT"
_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#"
_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#"


class MarkdownDocumentBackend(DeclarativeDocumentBackend):
def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
Expand Down Expand Up @@ -67,6 +72,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
self.in_table = False
self.md_table_buffer: list[str] = []
self.inline_texts: list[str] = []
self._html_blocks: int = 0

try:
if isinstance(self.path_or_stream, BytesIO):
Expand Down Expand Up @@ -295,16 +301,18 @@ def traverse(node: marko.block.BlockElement):
self.md_table_buffer.append("")

elif isinstance(element, marko.block.HTMLBlock):
self._html_blocks += 1
self.process_inline_text(parent_element, doc)
self.close_table(doc)
_log.debug("HTML Block: {}".format(element))
if (
len(element.children) > 0
len(element.body) > 0
): # If Marko doesn't return any content for HTML block, skip it
snippet_text = str(element.children).strip()
doc.add_text(
label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
)
html_block = element.body.strip()

# wrap in markers to enable post-processing in convert()
text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}"
doc.add_code(parent=parent_element, text=text_to_add)
else:
if not isinstance(element, str):
self.close_table(doc)
Expand Down Expand Up @@ -360,6 +368,42 @@ def convert(self) -> DoclingDocument:
# Start iterating from the root of the AST
self.iterate_elements(parsed_ast, 0, doc, None)
self.process_inline_text(None, doc) # handle last hanging inline text

# if HTML blocks were detected, export to HTML and delegate to HTML backend
if self._html_blocks > 0:

# export to HTML
html_backend_cls = HTMLDocumentBackend
html_str = doc.export_to_html()

def _restore_original_html(txt, regex):
_txt, count = re.subn(regex, "", txt)
if count != self._html_blocks:
raise RuntimeError(
"An internal error has occurred during Markdown conversion."
)
return _txt

# restore original HTML by removing previously added markers
for regex in [
rf"<pre>\s*<code>\s*{_START_MARKER}",
rf"{_STOP_MARKER}\s*</code>\s*</pre>",
]:
html_str = _restore_original_html(txt=html_str, regex=regex)
self._html_blocks = 0

# delegate to HTML backend
stream = BytesIO(bytes(html_str, encoding="utf-8"))
in_doc = InputDocument(
path_or_stream=stream,
format=InputFormat.HTML,
backend=html_backend_cls,
filename=self.file.name,
)
html_backend_obj = html_backend_cls(
in_doc=in_doc, path_or_stream=stream
)
doc = html_backend_obj.convert()
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."
Expand Down
25 changes: 25 additions & 0 deletions tests/data/groundtruth/docling_v2/mixed.md.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Title

Some text

## Famous ducks

Here is a table:

| Character | Name in German | Name in French | Name in Italian |
|----------------|------------------|------------------|-------------------|
| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
| Huey | Tick | Riri | Qui |
| Dewey | Trick | Fifi | Quo |
| Louie | Track | Loulou | Qua |

And here is more HTML:

Some paragraph.

Now a div — almost there...

- foo
- bar

The end!
14 changes: 14 additions & 0 deletions tests/data/groundtruth/docling_v2/word_tables.docx.html
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,20 @@
table tr:nth-child(even) td{
background-color: LightGray;
}
math annotation {
display: none;
}
.formula-not-decoded {
background: repeating-linear-gradient(
45deg, /* Angle of the stripes */
LightGray, /* First color */
LightGray 10px, /* Length of the first color */
White 10px, /* Second color */
White 20px /* Length of the second color */
);
margin: 0;
text-align: center;
}
</style>
</head>
<h2>Test with tables</h2>
Expand Down
54 changes: 54 additions & 0 deletions tests/data/md/mixed.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# Title

Some text

## Famous ducks

Here is a table:

<table>
<tr>
<th>Character</th>
<th>Name in German</th>
<th>Name in French</th>
<th>Name in Italian</th>
</tr>
<tr>
<td>Scrooge McDuck</td>
<td>Dagobert Duck</td>
<td>Balthazar Picsou</td>
<td>Paperone</td>
</tr>
<tr>
<td>Huey</td>
<td>Tick</td>
<td>Riri</td>
<td>Qui</td>
</tr>
<tr>
<td>Dewey</td>
<td>Trick</td>
<td>Fifi</td>
<td>Quo</td>
</tr>
<tr>
<td>Louie</td>
<td>Track</td>
<td>Loulou</td>
<td>Qua</td>
</tr>
</table>

And here is more HTML:

<p>Some paragraph.</p>

<div>
<p>Now a div — almost there...</p>
<ul>
<li>foo</li>
<li>bar</li>
</ul>
</div>

The end!