From 7adea4af08cb9c8976ba297d52aa147fa6e00d30 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Fri, 31 Jan 2025 16:51:29 +0100 Subject: [PATCH 1/2] fix(markdown): add support for HTML content Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docling/backend/md_backend.py | 54 +++++++++++++++++-- tests/data/groundtruth/docling_v2/mixed.md.md | 25 +++++++++ tests/data/md/mixed.md | 54 +++++++++++++++++++ 3 files changed, 128 insertions(+), 5 deletions(-) create mode 100644 tests/data/groundtruth/docling_v2/mixed.md.md create mode 100644 tests/data/md/mixed.md diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 3e23f073b..669096ebd 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -24,11 +24,16 @@ from marko import Markdown from docling.backend.abstract_backend import DeclarativeDocumentBackend +from docling.backend.html_backend import HTMLDocumentBackend from docling.datamodel.base_models import InputFormat from docling.datamodel.document import InputDocument _log = logging.getLogger(__name__) +_MARKER_BODY = "DOCLING_DOC_MD_HTML_EXPORT" +_START_MARKER = f"#_#_{_MARKER_BODY}_START_#_#" +_STOP_MARKER = f"#_#_{_MARKER_BODY}_STOP_#_#" + class MarkdownDocumentBackend(DeclarativeDocumentBackend): def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): @@ -67,6 +72,7 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] self.in_table = False self.md_table_buffer: list[str] = [] self.inline_texts: list[str] = [] + self._html_blocks: int = 0 try: if isinstance(self.path_or_stream, BytesIO): @@ -295,16 +301,18 @@ def traverse(node: marko.block.BlockElement): self.md_table_buffer.append("") elif isinstance(element, marko.block.HTMLBlock): + self._html_blocks += 1 self.process_inline_text(parent_element, doc) self.close_table(doc) _log.debug("HTML Block: {}".format(element)) if ( - 
len(element.children) > 0 + len(element.body) > 0 ): # If Marko doesn't return any content for HTML block, skip it - snippet_text = str(element.children).strip() - doc.add_text( - label=DocItemLabel.CODE, parent=parent_element, text=snippet_text - ) + html_block = element.body.strip() + + # wrap in markers to enable post-processing in convert() + text_to_add = f"{_START_MARKER}{html_block}{_STOP_MARKER}" + doc.add_code(parent=parent_element, text=text_to_add) else: if not isinstance(element, str): self.close_table(doc) @@ -360,6 +368,42 @@ def convert(self) -> DoclingDocument: # Start iterating from the root of the AST self.iterate_elements(parsed_ast, 0, doc, None) self.process_inline_text(None, doc) # handle last hanging inline text + + # if HTML blocks were detected, export to HTML and delegate to HTML backend + if self._html_blocks > 0: + + # export to HTML + html_backend_cls = HTMLDocumentBackend + html_str = doc.export_to_html() + + def _restore_original_html(txt, regex): + _txt, count = re.subn(regex, "", txt) + if count != self._html_blocks: + raise RuntimeError( + "An internal error has occurred during Markdown conversion." + ) + return _txt + + # restore original HTML by removing previously added markers + for regex in [ + rf"
<pre>\s*<code>\s*{_START_MARKER}",
+                    rf"{_STOP_MARKER}\s*</code>\s*</pre>
", + ]: + html_str = _restore_original_html(txt=html_str, regex=regex) + self._html_blocks = 0 + + # delegate to HTML backend + stream = BytesIO(bytes(html_str, encoding="utf-8")) + in_doc = InputDocument( + path_or_stream=stream, + format=InputFormat.HTML, + backend=html_backend_cls, + filename=self.file.name, + ) + html_backend_obj = html_backend_cls( + in_doc=in_doc, path_or_stream=stream + ) + doc = html_backend_obj.convert() else: raise RuntimeError( f"Cannot convert md with {self.document_hash} because the backend failed to init." diff --git a/tests/data/groundtruth/docling_v2/mixed.md.md b/tests/data/groundtruth/docling_v2/mixed.md.md new file mode 100644 index 000000000..6cd5d52b1 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/mixed.md.md @@ -0,0 +1,25 @@ +# Title + +Some text + +## Famous ducks + +Here is a table: + +| Character | Name in German | Name in French | Name in Italian | +|----------------|------------------|------------------|-------------------| +| Scrooge McDuck | Dagobert Duck | Balthazar Picsou | Paperone | +| Huey | Tick | Riri | Qui | +| Dewey | Trick | Fifi | Quo | +| Louie | Track | Loulou | Qua | + +And here is more HTML: + +Some paragraph. + +Now a div — almost there... + +- foo +- bar + +The end! diff --git a/tests/data/md/mixed.md b/tests/data/md/mixed.md new file mode 100644 index 000000000..470ffb03a --- /dev/null +++ b/tests/data/md/mixed.md @@ -0,0 +1,54 @@ +# Title + +Some text + +## Famous ducks + +Here is a table: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
CharacterName in GermanName in FrenchName in Italian
Scrooge McDuckDagobert DuckBalthazar PicsouPaperone
HueyTickRiriQui
DeweyTrickFifiQuo
LouieTrackLoulouQua
+ +And here is more HTML: + +

Some paragraph.

+ +
+

Now a div — almost there...

+ +
+ +The end! From f4b30fe7a7166b2231ff938e403071d852986fb5 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 3 Feb 2025 11:21:46 +0100 Subject: [PATCH 2/2] fix word test data Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- .../groundtruth/docling_v2/word_tables.docx.html | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/data/groundtruth/docling_v2/word_tables.docx.html b/tests/data/groundtruth/docling_v2/word_tables.docx.html index 30f6e8d35..00a790c07 100644 --- a/tests/data/groundtruth/docling_v2/word_tables.docx.html +++ b/tests/data/groundtruth/docling_v2/word_tables.docx.html @@ -53,6 +53,20 @@ table tr:nth-child(even) td{ background-color: LightGray; } + math annotation { + display: none; + } + .formula-not-decoded { + background: repeating-linear-gradient( + 45deg, /* Angle of the stripes */ + LightGray, /* First color */ + LightGray 10px, /* Length of the first color */ + White 10px, /* Second color */ + White 20px /* Length of the second color */ + ); + margin: 0; + text-align: center; + }

Test with tables