diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7d14c2ebf..9cd1e29b9 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
+ elif element.name in ["pre"]:
+ self.handle_code(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
@@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
level=hlevel,
)
+ def handle_code(self, element, idx, doc):
+     """Handles monospace code snippets (pre).
+
+     The element's text is stripped of surrounding whitespace and, when
+     anything is left, added to ``doc`` as a ``DocItemLabel.CODE`` text item
+     whose parent is ``self.parents[self.level]``. An element whose text is
+     ``None`` or blank after stripping adds nothing.
+     """
+     raw = element.text
+     if raw is None:
+         return
+     snippet = raw.strip()
+     code_label = DocItemLabel.CODE
+     if snippet:
+         doc.add_text(parent=self.parents[self.level], label=code_label, text=snippet)
+
def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index cbec761c5..b71cd859d 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
- # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
-
- # Loop through each shape in the slide
- for shape in slide.shapes:
+ def handle_shapes(shape, parent_slide, slide_ind, doc):
+ """Process one shape: recurse into groups, then handle its table, picture and text content."""
+ # handle_groups is defined further down; the name is only looked up when this call runs.
+ handle_groups(shape, parent_slide, slide_ind, doc)
if shape.has_table:
# Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc)
-
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
- # Handle Tables
+ # Handle Pictures
self.handle_pictures(shape, parent_slide, slide_ind, doc)
-
# If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"):
- continue
+ return
if shape.text is None:
- continue
+ return
if len(shape.text.strip()) == 0:
- continue
+ return
if not shape.has_text_frame:
- _log.warn("Warning: shape has text but not text_frame")
- continue
-
- # if shape.is_placeholder:
- # Handle Titles (Headers) and Subtitles
- # Check if the shape is a placeholder (titles are placeholders)
- # self.handle_title(shape, parent_slide, slide_ind, doc)
- # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
- # else:
-
+ _log.warning("Warning: shape has text but not text_frame")
+ return
# Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+ return
+
+ def handle_groups(shape, parent_slide, slide_ind, doc):
+     """Run handle_shapes on every member of a GROUP shape.
+
+     Shapes whose shape_type is not MSO_SHAPE_TYPE.GROUP are ignored.
+     """
+     if shape.shape_type != MSO_SHAPE_TYPE.GROUP:
+         return
+     for member in shape.shapes:
+         handle_shapes(member, parent_slide, slide_ind, doc)
- # figures...
- # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
+ # Loop through each shape in the slide
+ for shape in slide.shapes:
+ handle_shapes(shape, parent_slide, slide_ind, doc)
return doc
diff --git a/docs/concepts/index.md b/docs/concepts/index.md
new file mode 100644
index 000000000..a29db1afe
--- /dev/null
+++ b/docs/concepts/index.md
@@ -0,0 +1,3 @@
+In this area you can find guides on the main Docling concepts.
+
+Use the navigation on the left to browse through them.
diff --git a/docs/examples/index.md b/docs/examples/index.md
new file mode 100644
index 000000000..5c2d3acd5
--- /dev/null
+++ b/docs/examples/index.md
@@ -0,0 +1,3 @@
+In this area you can find examples covering a range of possible workflows and use cases.
+
+Use the navigation on the left to browse through them.
diff --git a/docs/integrations/index.md b/docs/integrations/index.md
new file mode 100644
index 000000000..c09c917d2
--- /dev/null
+++ b/docs/integrations/index.md
@@ -0,0 +1,3 @@
+In this area you can find guides on integrating Docling with popular frameworks and tools.
+
+Use the navigation on the left to browse through them.
diff --git a/mkdocs.yml b/mkdocs.yml
index 9ca6fdeb7..25eb48f44 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -39,7 +39,7 @@ theme:
- content.code.copy
- announce.dismiss
- navigation.tabs
- # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
+ - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.instant
- navigation.instant.prefetch
# - navigation.instant.preview
@@ -57,9 +57,11 @@ nav:
- Usage: usage.md
- Docling v2: v2.md
- Concepts:
+ - Concepts: concepts/index.md
- Docling Document: concepts/docling_document.md
# - Chunking: concepts/chunking.md
- Examples:
+ - Examples: examples/index.md
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
@@ -78,6 +80,7 @@ nav:
# - CLI:
# - CLI: examples/cli.md
- Integrations:
+ - Integrations: integrations/index.md
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md
# - API reference: