From 1239ade2750349d13d4e865d88449b232bbad944 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:49:06 +0100 Subject: [PATCH 1/3] docs: add navigation indices (#305) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/concepts/index.md | 3 +++ docs/examples/index.md | 3 +++ docs/integrations/index.md | 3 +++ mkdocs.yml | 5 ++++- 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 docs/concepts/index.md create mode 100644 docs/examples/index.md create mode 100644 docs/integrations/index.md diff --git a/docs/concepts/index.md b/docs/concepts/index.md new file mode 100644 index 000000000..a29db1afe --- /dev/null +++ b/docs/concepts/index.md @@ -0,0 +1,3 @@ +In this area you can find guides on the main Docling concepts. + +Use the navigation on the left to browse through them. diff --git a/docs/examples/index.md b/docs/examples/index.md new file mode 100644 index 000000000..5c2d3acd5 --- /dev/null +++ b/docs/examples/index.md @@ -0,0 +1,3 @@ +In this area you can find examples covering a range of possible workflows and use cases. + +Use the navigation on the left to browse through them. diff --git a/docs/integrations/index.md b/docs/integrations/index.md new file mode 100644 index 000000000..c09c917d2 --- /dev/null +++ b/docs/integrations/index.md @@ -0,0 +1,3 @@ +In this area you can find guides on the Docling integrations with popular frameworks and tools. + +Use the navigation on the left to browse through them. 
diff --git a/mkdocs.yml b/mkdocs.yml index 1fef4428d..2ce244bd7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,7 +39,7 @@ theme: - content.code.copy - announce.dismiss - navigation.tabs - # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used + - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used - navigation.instant - navigation.instant.prefetch # - navigation.instant.preview @@ -57,9 +57,11 @@ nav: - Usage: usage.md - Docling v2: v2.md - Concepts: + - Concepts: concepts/index.md - Docling Document: concepts/docling_document.md # - Chunking: concepts/chunking.md - Examples: + - Examples: examples/index.md - Conversion: - "Simple conversion": examples/minimal.py - "Custom conversion": examples/custom_convert.py @@ -77,6 +79,7 @@ nav: # - CLI: # - CLI: examples/cli.md - Integrations: + - Integrations: integrations/index.md - "LlamaIndex 🦙 extension": integrations/llamaindex.md # - "LangChain 🦜🔗 extension": integrations/langchain.md # - API reference: From 53bf2d179097c78968083bd7bbc1f1fddc897272 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:00:11 +0100 Subject: [PATCH 2/3] Added handling of code blocks in html with
 <pre> tag
 (#302)

Signed-off-by: Maksym Lysak 
Co-authored-by: Maksym Lysak 
---
 docling/backend/html_backend.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py
index 7d14c2ebf..9cd1e29b9 100644
--- a/docling/backend/html_backend.py
+++ b/docling/backend/html_backend.py
@@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
             self.handle_header(element, idx, doc)
         elif element.name in ["p"]:
             self.handle_paragraph(element, idx, doc)
+        elif element.name in ["pre"]:
+            self.handle_code(element, idx, doc)
         elif element.name in ["ul", "ol"]:
             self.handle_list(element, idx, doc)
         elif element.name in ["li"]:
@@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
                 level=hlevel,
             )
 
+    def handle_code(self, element, idx, doc):
+        """Handles monospace code snippets (pre)."""
+        if element.text is None:
+            return
+        text = element.text.strip()
+        label = DocItemLabel.CODE
+        if len(text) == 0:
+            return
+        doc.add_text(parent=self.parents[self.level], label=label, text=text)
+
     def handle_paragraph(self, element, idx, doc):
         """Handles paragraph tags (p)."""
         if element.text is None:

From 81c8243a8bf177feed8f87ea283b5bb6836350cb Mon Sep 17 00:00:00 2001
From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com>
Date: Mon, 11 Nov 2024 16:38:21 +0100
Subject: [PATCH 3/3] fix: Added handling of grouped elements in pptx backend
 (#307)

* Added handling of grouped elements in pptx backend

Signed-off-by: Maksym Lysak 

* updated log.warn to warning

Signed-off-by: Maksym Lysak 

---------

Signed-off-by: Maksym Lysak 
Co-authored-by: Maksym Lysak 
---
 docling/backend/mspowerpoint_backend.py | 39 +++++++++++--------------
 1 file changed, 17 insertions(+), 22 deletions(-)

diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py
index cbec761c5..b71cd859d 100644
--- a/docling/backend/mspowerpoint_backend.py
+++ b/docling/backend/mspowerpoint_backend.py
@@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:
 
             size = Size(width=slide_width, height=slide_height)
             parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
-            # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)
-
-            # Loop through each shape in the slide
-            for shape in slide.shapes:
 
+            def handle_shapes(shape, parent_slide, slide_ind, doc):
+                handle_groups(shape, parent_slide, slide_ind, doc)
                 if shape.has_table:
                     # Handle Tables
                     self.handle_tables(shape, parent_slide, slide_ind, doc)
-
                 if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
-                    # Handle Tables
+                    # Handle Pictures
                     self.handle_pictures(shape, parent_slide, slide_ind, doc)
-
                 # If shape doesn't have any text, move on to the next shape
                 if not hasattr(shape, "text"):
-                    continue
+                    return
                 if shape.text is None:
-                    continue
+                    return
                 if len(shape.text.strip()) == 0:
-                    continue
+                    return
                 if not shape.has_text_frame:
-                    _log.warn("Warning: shape has text but not text_frame")
-                    continue
-
-                # if shape.is_placeholder:
-                # Handle Titles (Headers) and Subtitles
-                # Check if the shape is a placeholder (titles are placeholders)
-                # self.handle_title(shape, parent_slide, slide_ind, doc)
-                # self.handle_text_elements(shape, parent_slide, slide_ind, doc)
-                # else:
-
+                    _log.warning("Warning: shape has text but not text_frame")
+                    return
                 # Handle other text elements, including lists (bullet lists, numbered lists)
                 self.handle_text_elements(shape, parent_slide, slide_ind, doc)
+                return
+
+            def handle_groups(shape, parent_slide, slide_ind, doc):
+                if shape.shape_type == MSO_SHAPE_TYPE.GROUP:
+                    for groupedshape in shape.shapes:
+                        handle_shapes(groupedshape, parent_slide, slide_ind, doc)
 
-                # figures...
-                # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
+            # Loop through each shape in the slide
+            for shape in slide.shapes:
+                handle_shapes(shape, parent_slide, slide_ind, doc)
 
         return doc