diff --git a/docling/backend/md_backend.py b/docling/backend/md_backend.py index 0a083981..3e23f073 100644 --- a/docling/backend/md_backend.py +++ b/docling/backend/md_backend.py @@ -6,6 +6,7 @@ from typing import List, Optional, Set, Union import marko +import marko.element import marko.ext import marko.ext.gfm import marko.inline @@ -163,14 +164,14 @@ def process_inline_text( def iterate_elements( self, - element: marko.block.Element, + element: marko.element.Element, depth: int, doc: DoclingDocument, parent_element: Optional[NodeItem] = None, ): # Iterates over all elements in the AST # Check for different element types and process relevant details - if isinstance(element, marko.block.Heading): + if isinstance(element, marko.block.Heading) and len(element.children) > 0: self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug( @@ -205,17 +206,22 @@ def traverse(node: marko.block.BlockElement): ) elif isinstance(element, marko.block.List): + has_non_empty_list_items = False + for child in element.children: + if isinstance(child, marko.block.ListItem) and len(child.children) > 0: + has_non_empty_list_items = True + break + self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(f" - List {'ordered' if element.ordered else 'unordered'}") - list_label = GroupLabel.LIST - if element.ordered: - list_label = GroupLabel.ORDERED_LIST - parent_element = doc.add_group( - label=list_label, name=f"list", parent=parent_element - ) + if has_non_empty_list_items: + label = GroupLabel.ORDERED_LIST if element.ordered else GroupLabel.LIST + parent_element = doc.add_group( + label=label, name=f"list", parent=parent_element + ) - elif isinstance(element, marko.block.ListItem): + elif isinstance(element, marko.block.ListItem) and len(element.children) > 0: self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(" - List item") @@ -245,20 +251,18 @@ def traverse(node: marko.block.BlockElement): doc.add_picture(parent=parent_element, caption=fig_caption) - elif isinstance(element, marko.block.Paragraph): + elif isinstance(element, marko.block.Paragraph) and len(element.children) > 0: self.process_inline_text(parent_element, doc) elif isinstance(element, marko.inline.RawText): _log.debug(f" - Paragraph (raw text): {element.children}") - snippet_text = str(element.children).strip() + snippet_text = element.children.strip() # Detect start of the table: if "|" in snippet_text: # most likely part of the markdown table self.in_table = True if len(self.md_table_buffer) > 0: - self.md_table_buffer[len(self.md_table_buffer) - 1] += str( - snippet_text - ) + self.md_table_buffer[len(self.md_table_buffer) - 1] += snippet_text else: self.md_table_buffer.append(snippet_text) else: @@ -274,18 +278,15 @@ def traverse(node: marko.block.BlockElement): snippet_text = str(element.children).strip() doc.add_code(parent=parent_element, text=snippet_text) - elif isinstance(element, marko.block.CodeBlock): - self.close_table(doc) - self.process_inline_text(parent_element, doc) - _log.debug(f" - Code Block: {element.children}") - snippet_text = str(element.children[0].children).strip() # type: ignore - doc.add_code(parent=parent_element, text=snippet_text) - - elif isinstance(element, marko.block.FencedCode): + elif ( + isinstance(element, (marko.block.CodeBlock, marko.block.FencedCode)) + and len(element.children) > 0 + and isinstance((first_child := element.children[0]), marko.inline.RawText) + and len(snippet_text := (first_child.children.strip())) > 0 + ): self.close_table(doc) self.process_inline_text(parent_element, doc) _log.debug(f" - Code Block: {element.children}") - snippet_text = str(element.children[0].children).strip() # type: ignore doc.add_code(parent=parent_element, text=snippet_text) elif isinstance(element, marko.inline.LineBreak): @@ -309,14 +310,21 @@ def traverse(node: marko.block.BlockElement): self.close_table(doc) _log.debug("Some other element: {}".format(element)) + processed_block_types = ( + marko.block.ListItem, + marko.block.Heading, + marko.block.CodeBlock, + marko.block.FencedCode, + # marko.block.Paragraph, + marko.inline.RawText, + ) + # Iterate through the element's children (if any) - if not isinstance(element, marko.block.ListItem): - if not isinstance(element, marko.block.Heading): - if not isinstance(element, marko.block.FencedCode): - # if not isinstance(element, marko.block.Paragraph): - if hasattr(element, "children"): - for child in element.children: - self.iterate_elements(child, depth + 1, doc, parent_element) + if hasattr(element, "children") and not isinstance( + element, processed_block_types + ): + for child in element.children: + self.iterate_elements(child, depth + 1, doc, parent_element) def is_valid(self) -> bool: return self.valid diff --git a/tests/data/groundtruth/docling_v2/blocks.md.md b/tests/data/groundtruth/docling_v2/blocks.md.md new file mode 100644 index 00000000..5269e7d8 --- /dev/null +++ b/tests/data/groundtruth/docling_v2/blocks.md.md @@ -0,0 +1,33 @@ +Unordered list: + +- foo + +Empty unordered list: + +Ordered list: + +- bar + +Empty ordered list: + +Heading: + +# my heading + +Empty heading: + +Indented code block: + +``` +print("Hi!") +``` + +Empty indented code block: + +Fenced code block: + +``` +print("Hello world!") +``` + +Empty fenced code block: diff --git a/tests/data/md/blocks.md b/tests/data/md/blocks.md new file mode 100644 index 00000000..9980bb68 --- /dev/null +++ b/tests/data/md/blocks.md @@ -0,0 +1,43 @@ +Unordered list: + +- foo + +Empty unordered list: + +- + +Ordered list: + +1. bar + +Empty ordered list: + +1. + +Heading: + +# my heading + +Empty heading: + +# + +Indented code block: + + print("Hi!") + +Empty indented code block: + + + +Fenced code block: + +```python +print("Hello world!") +``` + +Empty fenced code block: + +``` + +```