From d5b2c0729559f9d9c1415101b500252c27142456 Mon Sep 17 00:00:00 2001 From: Michele Dolfi Date: Tue, 21 Jan 2025 18:19:49 +0100 Subject: [PATCH] use new add_code in backends and update typing in MD backend Signed-off-by: Michele Dolfi --- docling/backend/html_backend.py | 2 +- docling/backend/md_backend.py | 70 +++++++++++++++++++++------------ 2 files changed, 45 insertions(+), 27 deletions(-) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index ae478885..66dd4a2c 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc): label = DocItemLabel.CODE if len(text) == 0: return - doc.add_text(parent=self.parents[self.level], label=label, text=text) + doc.add_code(parent=self.parents[self.level], label=label, text=text) def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 25bc3682..8171085c 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -3,19 +3,22 @@ import warnings from io import BytesIO from pathlib import Path -from typing import Set, Union +from typing import List, Optional, Set, Union import marko import marko.ext import marko.ext.gfm import marko.inline from docling_core.types.doc import ( + DocItem, DocItemLabel, DoclingDocument, DocumentOrigin, GroupLabel, + NodeItem, TableCell, TableData, + TextItem, ) from marko import Markdown @@ -27,7 +30,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend): - def shorten_underscore_sequences(self, markdown_text, max_length=10): + def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10): # This regex will match any sequence of underscores pattern = r"_+" @@ -89,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path] ) from e return - def close_table(self, doc=None): + def close_table(self, doc: DoclingDocument): if self.in_table: _log.debug("=== TABLE START ===") for md_table_row in self.md_table_buffer: _log.debug(md_table_row) _log.debug("=== TABLE END ===") - tcells = [] + tcells: List[TableCell] = [] result_table = [] for n, md_table_row in enumerate(self.md_table_buffer): data = [] @@ -136,15 +139,19 @@ def close_table(self, doc=None): self.in_table = False self.md_table_buffer = [] # clean table markdown buffer # Initialize Docling TableData - data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells) + table_data = TableData( + num_rows=num_rows, num_cols=num_cols, table_cells=tcells + ) # Populate for tcell in tcells: - data.table_cells.append(tcell) + table_data.table_cells.append(tcell) if len(tcells) > 0: - doc.add_table(data=data) + doc.add_table(data=table_data) return - def process_inline_text(self, parent_element, doc=None): + def process_inline_text( + self, parent_element: Optional[NodeItem], doc: DoclingDocument + ): # self.inline_text_buffer += str(text_in) txt = self.inline_text_buffer.strip() if len(txt) > 0: @@ -155,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None): ) self.inline_text_buffer = "" - def iterate_elements(self, element, depth=0, doc=None, parent_element=None): + def iterate_elements( + self, + element: marko.block.Element, + depth: int, + doc: DoclingDocument, + parent_element: Optional[NodeItem] = None, + ): # Iterates over all elements in the AST # Check for different element types and process relevant details if isinstance(element, marko.block.Heading): self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug( - f" - Heading level {element.level}, content: {element.children[0].children}" + f" - Heading level {element.level}, content: {element.children[0].children}" # type: ignore ) if element.level == 1: doc_label = DocItemLabel.TITLE @@ -171,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None): # Header could have arbitrary inclusion of bold, italic or emphasis, # hence we need to traverse the tree to get full text of a header - strings = [] + strings: List[str] = [] # Define a recursive function to traverse the tree - def traverse(node): + def traverse(node: marko.block.BlockElement): # Check if the node has a "children" attribute if hasattr(node, "children"): # If "children" is a list, continue traversal @@ -208,9 +221,13 @@ def traverse(node): self.process_inline_text(parent_element, doc) _log.debug(" - List item") - snippet_text = str(element.children[0].children[0].children) + snippet_text = str(element.children[0].children[0].children) # type: ignore is_numbered = False - if parent_element.label == GroupLabel.ORDERED_LIST: + if ( + parent_element is not None + and isinstance(parent_element, DocItem) + and parent_element.label == GroupLabel.ORDERED_LIST + ): is_numbered = True doc.add_list_item( enumerated=is_numbered, parent=parent_element, text=snippet_text @@ -220,7 +237,14 @@ def traverse(node): self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(f" - Image with alt: {element.title}, url: {element.dest}") - doc.add_picture(parent=parent_element, caption=element.title) + + fig_caption: Optional[TextItem] = None + if element.title is not None and element.title != "": + fig_caption = doc.add_text( + label=DocItemLabel.CAPTION, text=element.title + ) + + doc.add_picture(parent=parent_element, caption=fig_caption) elif isinstance(element, marko.block.Paragraph): self.process_inline_text(parent_element, doc) @@ -251,27 +275,21 @@ def traverse(node): self.process_inline_text(parent_element, doc) _log.debug(f" - Code Span: {element.children}") snippet_text = str(element.children).strip() - doc.add_text( - label=DocItemLabel.CODE, parent=parent_element, text=snippet_text - ) + doc.add_code(parent=parent_element, text=snippet_text) elif isinstance(element, marko.block.CodeBlock): self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(f" - Code Block: {element.children}") - snippet_text = str(element.children[0].children).strip() - doc.add_text( - label=DocItemLabel.CODE, parent=parent_element, text=snippet_text - ) + snippet_text = str(element.children[0].children).strip() # type: ignore + doc.add_code(parent=parent_element, text=snippet_text) elif isinstance(element, marko.block.FencedCode): self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(f" - Code Block: {element.children}") - snippet_text = str(element.children[0].children).strip() - doc.add_text( - label=DocItemLabel.CODE, parent=parent_element, text=snippet_text - ) + snippet_text = str(element.children[0].children).strip() # type: ignore + doc.add_code(parent=parent_element, text=snippet_text) elif isinstance(element, marko.inline.LineBreak): self.process_inline_text(parent_element, doc)