From 1239ade2750349d13d4e865d88449b232bbad944 Mon Sep 17 00:00:00 2001
From: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
Date: Mon, 11 Nov 2024 14:49:06 +0100
Subject: [PATCH 1/3] docs: add navigation indices (#305)

Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com>
---
 docs/concepts/index.md     | 3 +++
 docs/examples/index.md     | 3 +++
 docs/integrations/index.md | 3 +++
 mkdocs.yml                 | 5 ++++-
 4 files changed, 13 insertions(+), 1 deletion(-)
 create mode 100644 docs/concepts/index.md
 create mode 100644 docs/examples/index.md
 create mode 100644 docs/integrations/index.md
diff --git a/docs/concepts/index.md b/docs/concepts/index.md
new file mode 100644
index 000000000..a29db1afe
--- /dev/null
+++ b/docs/concepts/index.md
@@ -0,0 +1,3 @@
+In this area you can find guides on the main Docling concepts.
+
+Use the navigation on the left to browse through them.
diff --git a/docs/examples/index.md b/docs/examples/index.md
new file mode 100644
index 000000000..5c2d3acd5
--- /dev/null
+++ b/docs/examples/index.md
@@ -0,0 +1,3 @@
+In this area you can find examples covering a range of possible workflows and use cases.
+
+Use the navigation on the left to browse through them.
diff --git a/docs/integrations/index.md b/docs/integrations/index.md
new file mode 100644
index 000000000..c09c917d2
--- /dev/null
+++ b/docs/integrations/index.md
@@ -0,0 +1,3 @@
+In this area you can find guides on the Docling integrations with popular frameworks and tools.
+
+Use the navigation on the left to browse through them.
diff --git a/mkdocs.yml b/mkdocs.yml
index 1fef4428d..2ce244bd7 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -39,7 +39,7 @@ theme:
     - content.code.copy
     - announce.dismiss
     - navigation.tabs
-    # - navigation.indexes  # <= if set, each "section" can have its own page, if index.md is used
+    - navigation.indexes  # <= if set, each "section" can have its own page, if index.md is used
     - navigation.instant
     - navigation.instant.prefetch
     # - navigation.instant.preview
@@ -57,9 +57,11 @@ nav:
     - Usage: usage.md
     - Docling v2: v2.md
   - Concepts:
+    - Concepts: concepts/index.md
     - Docling Document: concepts/docling_document.md
   #   - Chunking: concepts/chunking.md
   - Examples:
+    - Examples: examples/index.md
     - Conversion:
       - "Simple conversion": examples/minimal.py
       - "Custom conversion": examples/custom_convert.py
@@ -77,6 +79,7 @@ nav:
     # - CLI:
     #   - CLI: examples/cli.md
   - Integrations:
+    - Integrations: integrations/index.md
     - "LlamaIndex 🦙 extension": integrations/llamaindex.md
     # - "LangChain 🦜🔗 extension": integrations/langchain.md
   # - API reference:

From 53bf2d179097c78968083bd7bbc1f1fddc897272 Mon Sep 17 00:00:00 2001
From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com>
Date: Mon, 11 Nov 2024 15:00:11 +0100
Subject: [PATCH 2/3] Added handling of code blocks in html with <pre> tag
 (#302)

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/backend/html_backend.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7d14c2ebf..9cd1e29b9 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
             self.handle_header(element, idx, doc)
         elif element.name in ["p"]:
             self.handle_paragraph(element, idx, doc)
+        elif element.name in ["pre"]:
+            self.handle_code(element, idx, doc)
         elif element.name in ["ul", "ol"]:
             self.handle_list(element, idx, doc)
         elif element.name in ["li"]:
@@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
                 level=hlevel,
             )
 
+    def handle_code(self, element, idx, doc):
+        """Add the stripped text of a <pre> element as a CODE item, if non-empty."""
+        raw = element.text
+        if raw is None:
+            return
+        text = raw.strip()
+        if text:
+            parent = self.parents[self.level]
+            doc.add_text(parent=parent, label=DocItemLabel.CODE, text=text)
+
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
         if element.text is None:

From 81c8243a8bf177feed8f87ea283b5bb6836350cb Mon Sep 17 00:00:00 2001
From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com>
Date: Mon, 11 Nov 2024 16:38:21 +0100
Subject: [PATCH 3/3] fix: Added handling of grouped elements in pptx backend
 (#307)

* Added handling of grouped elements in pptx backend

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

* Replaced deprecated log.warn with log.warning

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>

---------

Signed-off-by: Maksym Lysak <mly@zurich.ibm.com>
Co-authored-by: Maksym Lysak <mly@zurich.ibm.com>
---
 docling/backend/mspowerpoint_backend.py | 39 +++++++++++--------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index cbec761c5..b71cd859d 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
 
             size = Size(width=slide_width, height=slide_height)
             parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
-            # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
-
-            # Loop through each shape in the slide
-            for shape in slide.shapes:
 
+            def handle_shapes(shape, parent_slide, slide_ind, doc):
+                """Handle one shape: groups first, then tables, pictures, text."""
+                handle_groups(shape, parent_slide, slide_ind, doc)
                 if shape.has_table:
                     # Handle Tables
                     self.handle_tables(shape, parent_slide, slide_ind, doc)
-
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                    # Handle Tables
+                    # Handle Pictures
                     self.handle_pictures(shape, parent_slide, slide_ind, doc)
-
                 # If shape doesn't have any text, move on to the next shape
                 if not hasattr(shape, "text"):
-                    continue
+                    return
                 if shape.text is None:
-                    continue
+                    return
                 if len(shape.text.strip()) == 0:
-                    continue
+                    return
                 if not shape.has_text_frame:
-                    _log.warn("Warning: shape has text but not text_frame")
-                    continue
-
-                # if shape.is_placeholder:
-                # Handle Titles (Headers) and Subtitles
-                # Check if the shape is a placeholder (titles are placeholders)
-                # self.handle_title(shape, parent_slide, slide_ind, doc)
-                # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
-                # else:
-
+                    _log.warning("Warning: shape has text but not text_frame")
+                    return
                 # Handle other text elements, including lists (bullet lists, numbered lists)
                 self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+
+            def handle_groups(shape, parent_slide, slide_ind, doc):
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for grouped_shape in shape.shapes:
+                        handle_shapes(grouped_shape, parent_slide, slide_ind, doc)
 
-                # figures...
-                # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+                handle_shapes(shape, parent_slide, slide_ind, doc)
 
         return doc