From 7adea4af08cb9c8976ba297d52aa147fa6e00d30 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:51:29 +0100 Subject: [PATCH 1/2] fix(markdown): add support for HTML content Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/md_backend.py | 54 +++++++++++++++++-- tests/data/groundtruth/docling_v2/mixed.md.md | 25 +++++++++ tests/data/md/mixed.md | 54 +++++++++++++++++++ 3 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/mixed.md.md create mode 100644 tests/data/md/mixed.md diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 3e23f073b..669096ebd 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -24,11 +24,16 @@ from marko import Markdown from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT" +_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#" +_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#" + class MarkdownDocumentBackend(DeclarativeDocumentBackend): def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): @@ -67,6 +72,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.in_table = False self.md_table_buffer: list[str] = [] self.inline_texts: list[str] = [] + self._html_blocks: int = 0 try: if isinstance(self.path_or_stream, BytesIO): @@ -295,16 +301,18 @@ def traverse(node: marko.block.BlockElement): self.md_table_buffer.append("") elif isinstance(element, marko.block.HTMLBlock): + self._html_blocks += 1 self.process_inline_text(parent_element, doc) self.close_table(doc) _log.debug("HTML Block: {}".format(element)) if ( - 
len(element.children) > 0 + len(element.body) > 0 ): # If Marko doesn't return any content for HTML block, skip it - snippet_text = str(element.children).strip() - doc.add_text( - label=DocItemLabel.CODE, parent=parent_element, text=snippet_text - ) + html_block = element.body.strip() + + # wrap in markers to enable post-processing in convert() + text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" + doc.add_code(parent=parent_element, text=text_to_add) else: if not isinstance(element, str): self.close_table(doc) @@ -360,6 +368,42 @@ def convert(self) -> DoclingDocument: # Start iterating from the root of the AST self.iterate_elements(parsed_ast, 0, doc, None) self.process_inline_text(None, doc) # handle last hanging inline text + + # if HTML blocks were detected, export to HTML and delegate to HTML backend + if self._html_blocks > 0: + + # export to HTML + html_backend_cls = HTMLDocumentBackend + html_str = doc.export_to_html() + + def _restore_original_html(txt, regex): + _txt, count = re.subn(regex, "", txt) + if count != self._html_blocks: + raise RuntimeError( + "An internal error has occurred during Markdown conversion." + ) + return _txt + + # restore original HTML by removing previously added markers + for regex in [ + rf"<pre>\s*<code>\s*{_START_MARKER}",
+ rf"{_STOP_MARKER}\s*</code>\s*</pre>",
+ ]:
+ html_str = _restore_original_html(txt=html_str, regex=regex)
+ self._html_blocks = 0
+
+ # delegate to HTML backend
+ stream = BytesIO(bytes(html_str, encoding="utf-8"))
+ in_doc = InputDocument(
+ path_or_stream=stream,
+ format=InputFormat.HTML,
+ backend=html_backend_cls,
+ filename=self.file.name,
+ )
+ html_backend_obj = html_backend_cls(
+ in_doc=in_doc, path_or_stream=stream
+ )
+ doc = html_backend_obj.convert()
else:
raise RuntimeError(
f"Cannot convert md with {self.document_hash} because the backend failed to init."
diff --git a/tests/data/groundtruth/docling_v2/mixed.md.md b/tests/data/groundtruth/docling_v2/mixed.md.md
new file mode 100644
index 000000000..6cd5d52b1
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/mixed.md.md
@@ -0,0 +1,25 @@
+# Title
+
+Some text
+
+## Famous ducks
+
+Here is a table:
+
+| Character | Name in German | Name in French | Name in Italian |
+|----------------|------------------|------------------|-------------------|
+| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone |
+| Huey | Tick | Riri | Qui |
+| Dewey | Trick | Fifi | Quo |
+| Louie | Track | Loulou | Qua |
+
+And here is more HTML:
+
+Some paragraph.
+
+Now a div — almost there...
+
+- foo
+- bar
+
+The end!
diff --git a/tests/data/md/mixed.md b/tests/data/md/mixed.md
new file mode 100644
index 000000000..470ffb03a
--- /dev/null
+++ b/tests/data/md/mixed.md
@@ -0,0 +1,54 @@
+# Title
+
+Some text
+
+## Famous ducks
+
+Here is a table:
+
+Character | +Name in German | +Name in French | +Name in Italian | +
---|---|---|---|
Scrooge McDuck | +Dagobert Duck | +Balthazar Picsou | +Paperone | +
Huey | +Tick | +Riri | +Qui | +
Dewey | +Trick | +Fifi | +Quo | +
Louie | +Track | +Loulou | +Qua | +
Some paragraph.
+ +Now a div — almost there...
+