From d5b2c0729559f9d9c1415101b500252c27142456 Mon Sep 17 00:00:00 2001
From: Michele Dolfi <dol@zurich.ibm.com>
Date: Tue, 21 Jan 2025 18:19:49 +0100
Subject: [PATCH] use new add_code in backends and update typing in MD backend

Signed-off-by: Michele Dolfi <dol@zurich.ibm.com>
---
 docling/backend/html_backend.py |  2 +-
 docling/backend/md_backend.py   | 70 +++++++++++++++++++++------------
 2 files changed, 45 insertions(+), 27 deletions(-)

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index ae478885..66dd4a2c 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -215,7 +215,7 @@ def handle_code(self, element, idx, doc):
         label = DocItemLabel.CODE
         if len(text) == 0:
             return
-        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+        doc.add_code(parent=self.parents[self.level], text=text)
 
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py
index 25bc3682..8171085c 100644
--- a/docling/backend/md_backend.py
+++ b/docling/backend/md_backend.py
@@ -3,19 +3,22 @@
 import warnings
 from io import BytesIO
 from pathlib import Path
-from typing import Set, Union
+from typing import List, Optional, Set, Union
 
 import marko
 import marko.ext
 import marko.ext.gfm
 import marko.inline
 from docling_core.types.doc import (
+    DocItem,
     DocItemLabel,
     DoclingDocument,
     DocumentOrigin,
     GroupLabel,
+    NodeItem,
     TableCell,
     TableData,
+    TextItem,
 )
 from marko import Markdown
 
@@ -27,7 +30,7 @@
 
 
 class MarkdownDocumentBackend(DeclarativeDocumentBackend):
-    def shorten_underscore_sequences(self, markdown_text, max_length=10):
+    def shorten_underscore_sequences(self, markdown_text: str, max_length: int = 10):
         # This regex will match any sequence of underscores
         pattern = r"_+"
 
@@ -89,13 +92,13 @@ def __init__(self, in_doc: "InputDocument", path_or_stream: Union[BytesIO, Path]
             ) from e
         return
 
-    def close_table(self, doc=None):
+    def close_table(self, doc: DoclingDocument):
         if self.in_table:
             _log.debug("=== TABLE START ===")
             for md_table_row in self.md_table_buffer:
                 _log.debug(md_table_row)
             _log.debug("=== TABLE END ===")
-            tcells = []
+            tcells: List[TableCell] = []
             result_table = []
             for n, md_table_row in enumerate(self.md_table_buffer):
                 data = []
@@ -136,15 +139,19 @@ def close_table(self, doc=None):
             self.in_table = False
             self.md_table_buffer = []  # clean table markdown buffer
             # Initialize Docling TableData
-            data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=tcells)
+            # Start from an empty cell list: the loop below appends every cell,
+            # so also passing tcells to the constructor would add them twice.
+            table_data = TableData(num_rows=num_rows, num_cols=num_cols, table_cells=[])
             # Populate
             for tcell in tcells:
-                data.table_cells.append(tcell)
+                table_data.table_cells.append(tcell)
             if len(tcells) > 0:
-                doc.add_table(data=data)
+                doc.add_table(data=table_data)
         return
 
-    def process_inline_text(self, parent_element, doc=None):
+    def process_inline_text(
+        self, parent_element: Optional[NodeItem], doc: DoclingDocument
+    ):
         # self.inline_text_buffer += str(text_in)
         txt = self.inline_text_buffer.strip()
         if len(txt) > 0:
@@ -155,14 +162,20 @@ def process_inline_text(self, parent_element, doc=None):
             )
         self.inline_text_buffer = ""
 
-    def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
+    def iterate_elements(
+        self,
+        element: marko.block.Element,
+        depth: int,
+        doc: DoclingDocument,
+        parent_element: Optional[NodeItem] = None,
+    ):
         # Iterates over all elements in the AST
         # Check for different element types and process relevant details
         if isinstance(element, marko.block.Heading):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(
-                f" - Heading level {element.level}, content: {element.children[0].children}"
+                f" - Heading level {element.level}, content: {element.children[0].children}"  # type: ignore
             )
             if element.level == 1:
                 doc_label = DocItemLabel.TITLE
@@ -171,10 +184,10 @@ def iterate_elements(self, element, depth=0, doc=None, parent_element=None):
 
             # Header could have arbitrary inclusion of bold, italic or emphasis,
             # hence we need to traverse the tree to get full text of a header
-            strings = []
+            strings: List[str] = []
 
             # Define a recursive function to traverse the tree
-            def traverse(node):
+            def traverse(node: marko.block.BlockElement):
                 # Check if the node has a "children" attribute
                 if hasattr(node, "children"):
                     # If "children" is a list, continue traversal
@@ -208,9 +221,13 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(" - List item")
 
-            snippet_text = str(element.children[0].children[0].children)
+            snippet_text = str(element.children[0].children[0].children)  # type: ignore
             is_numbered = False
-            if parent_element.label == GroupLabel.ORDERED_LIST:
+            # The label is compared with a GroupLabel, i.e. the parent is a group
+            # node; groups are not DocItem instances, so narrowing to DocItem
+            # would never match. getattr keeps the check None-safe instead.
+            ordered = GroupLabel.ORDERED_LIST
+            if getattr(parent_element, "label", None) == ordered:
                 is_numbered = True
             doc.add_list_item(
                 enumerated=is_numbered, parent=parent_element, text=snippet_text
@@ -220,7 +237,14 @@ def traverse(node):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Image with alt: {element.title}, url: {element.dest}")
-            doc.add_picture(parent=parent_element, caption=element.title)
+
+            fig_caption: Optional[TextItem] = None
+            if element.title is not None and element.title != "":
+                fig_caption = doc.add_text(
+                    label=DocItemLabel.CAPTION, text=element.title
+                )
+
+            doc.add_picture(parent=parent_element, caption=fig_caption)
 
         elif isinstance(element, marko.block.Paragraph):
             self.process_inline_text(parent_element, doc)
@@ -251,27 +275,21 @@ def traverse(node):
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Span: {element.children}")
             snippet_text = str(element.children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.CodeBlock):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.block.FencedCode):
             self.close_table(doc)
             self.process_inline_text(parent_element, doc)
             _log.debug(f" - Code Block: {element.children}")
-            snippet_text = str(element.children[0].children).strip()
-            doc.add_text(
-                label=DocItemLabel.CODE, parent=parent_element, text=snippet_text
-            )
+            snippet_text = str(element.children[0].children).strip()  # type: ignore
+            doc.add_code(parent=parent_element, text=snippet_text)
 
         elif isinstance(element, marko.inline.LineBreak):
             self.process_inline_text(parent_element, doc)