From 1239ade2750349d13d4e865d88449b232bbad944 Mon Sep 17 00:00:00 2001 From: Panos Vagenas <35837085+vagenas@users.noreply.github.com> Date: Mon, 11 Nov 2024 14:49:06 +0100 Subject: [PATCH 1/3] docs: add navigation indices (#305) Signed-off-by: Panos Vagenas <35837085+vagenas@users.noreply.github.com> --- docs/concepts/index.md | 3 +++ docs/examples/index.md | 3 +++ docs/integrations/index.md | 3 +++ mkdocs.yml | 5 ++++- 4 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 docs/concepts/index.md create mode 100644 docs/examples/index.md create mode 100644 docs/integrations/index.md diff --git a/docs/concepts/index.md b/docs/concepts/index.md new file mode 100644 index 000000000..a29db1afe --- /dev/null +++ b/docs/concepts/index.md @@ -0,0 +1,3 @@ +In this area you can find guides on the main Docling concepts. + +Use the navigation on the left to browse through them. diff --git a/docs/examples/index.md b/docs/examples/index.md new file mode 100644 index 000000000..5c2d3acd5 --- /dev/null +++ b/docs/examples/index.md @@ -0,0 +1,3 @@ +In this area you can find examples covering a range of possible workflows and use cases. + +Use the navigation on the left to browse through them. diff --git a/docs/integrations/index.md b/docs/integrations/index.md new file mode 100644 index 000000000..c09c917d2 --- /dev/null +++ b/docs/integrations/index.md @@ -0,0 +1,3 @@ +In this area you can find guides on the Docling integrations with popular frameworks and tools. + +Use the navigation on the left to browse through them. 
diff --git a/mkdocs.yml b/mkdocs.yml index 1fef4428d..2ce244bd7 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -39,7 +39,7 @@ theme: - content.code.copy - announce.dismiss - navigation.tabs - # - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used + - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used - navigation.instant - navigation.instant.prefetch # - navigation.instant.preview @@ -57,9 +57,11 @@ nav: - Usage: usage.md - Docling v2: v2.md - Concepts: + - Concepts: concepts/index.md - Docling Document: concepts/docling_document.md # - Chunking: concepts/chunking.md - Examples: + - Examples: examples/index.md - Conversion: - "Simple conversion": examples/minimal.py - "Custom conversion": examples/custom_convert.py @@ -77,6 +79,7 @@ nav: # - CLI: # - CLI: examples/cli.md - Integrations: + - Integrations: integrations/index.md - "LlamaIndex 🦙 extension": integrations/llamaindex.md # - "LangChain 🦜🔗 extension": integrations/langchain.md # - API reference: From 53bf2d179097c78968083bd7bbc1f1fddc897272 Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:00:11 +0100 Subject: [PATCH 2/3] Added handling of code blocks in html with
<pre> tag (#302) Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/html_backend.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docling/backend/html_backend.py b/docling/backend/html_backend.py index 7d14c2ebf..9cd1e29b9 100644 --- a/docling/backend/html_backend.py +++ b/docling/backend/html_backend.py @@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc): self.handle_header(element, idx, doc) elif element.name in ["p"]: self.handle_paragraph(element, idx, doc) + elif element.name in ["pre"]: + self.handle_code(element, idx, doc) elif element.name in ["ul", "ol"]: self.handle_list(element, idx, doc) elif element.name in ["li"]: @@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc): level=hlevel, ) + def handle_code(self, element, idx, doc): + """Handles monospace code snippets (pre).""" + if element.text is None: + return + text = element.text.strip() + label = DocItemLabel.CODE + if len(text) == 0: + return + doc.add_text(parent=self.parents[self.level], label=label, text=text) + def handle_paragraph(self, element, idx, doc): """Handles paragraph tags (p).""" if element.text is None: From 81c8243a8bf177feed8f87ea283b5bb6836350cb Mon Sep 17 00:00:00 2001 From: Maxim Lysak <101627549+maxmnemonic@users.noreply.github.com> Date: Mon, 11 Nov 2024 16:38:21 +0100 Subject: [PATCH 3/3] fix: Added handling of grouped elements in pptx backend (#307) * Added handling of grouped elements in pptx backend Signed-off-by: Maksym Lysak * updated log.warn to warning Signed-off-by: Maksym Lysak --------- Signed-off-by: Maksym Lysak Co-authored-by: Maksym Lysak --- docling/backend/mspowerpoint_backend.py | 39 +++++++++++-------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index cbec761c5..b71cd859d 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -358,41 +358,36 @@ def 
walk_linear(self, pptx_obj, doc) -> DoclingDocument: size = Size(width=slide_width, height=slide_height) parent_page = doc.add_page(page_no=slide_ind + 1, size=size) - # parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash) - - # Loop through each shape in the slide - for shape in slide.shapes: + def handle_shapes(shape, parent_slide, slide_ind, doc): + handle_groups(shape, parent_slide, slide_ind, doc) if shape.has_table: # Handle Tables self.handle_tables(shape, parent_slide, slide_ind, doc) - if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: - # Handle Tables + # Handle Pictures self.handle_pictures(shape, parent_slide, slide_ind, doc) - # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): - continue + return if shape.text is None: - continue + return if len(shape.text.strip()) == 0: - continue + return if not shape.has_text_frame: - _log.warn("Warning: shape has text but not text_frame") - continue - - # if shape.is_placeholder: - # Handle Titles (Headers) and Subtitles - # Check if the shape is a placeholder (titles are placeholders) - # self.handle_title(shape, parent_slide, slide_ind, doc) - # self.handle_text_elements(shape, parent_slide, slide_ind, doc) - # else: - + _log.warning("Warning: shape has text but not text_frame") + return # Handle other text elements, including lists (bullet lists, numbered lists) self.handle_text_elements(shape, parent_slide, slide_ind, doc) + return + + def handle_groups(shape, parent_slide, slide_ind, doc): + if shape.shape_type == MSO_SHAPE_TYPE.GROUP: + for groupedshape in shape.shapes: + handle_shapes(groupedshape, parent_slide, slide_ind, doc) - # figures... - # doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None) + # Loop through each shape in the slide + for shape in slide.shapes: + handle_shapes(shape, parent_slide, slide_ind, doc) return doc