Skip to content

Commit

Permalink
Merge branch 'main' into force_ocr
Browse files Browse the repository at this point in the history
  • Loading branch information
nikos-livathinos committed Nov 11, 2024
2 parents 7a0f160 + 81c8243 commit 088ce5f
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 23 deletions.
12 changes: 12 additions & 0 deletions docling/backend/html_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ def analyse_element(self, element, idx, doc):
self.handle_header(element, idx, doc)
elif element.name in ["p"]:
self.handle_paragraph(element, idx, doc)
elif element.name in ["pre"]:
self.handle_code(element, idx, doc)
elif element.name in ["ul", "ol"]:
self.handle_list(element, idx, doc)
elif element.name in ["li"]:
Expand Down Expand Up @@ -205,6 +207,16 @@ def handle_header(self, element, idx, doc):
level=hlevel,
)

def handle_code(self, element, idx, doc):
    """Add the text of a preformatted (pre) element to the document as code.

    The element's text is stripped of surrounding whitespace and added
    under the current parent with the CODE label. Nothing is added when
    the element has no text or the stripped text is empty.
    """
    raw_text = element.text
    if raw_text is None:
        return

    snippet = raw_text.strip()
    code_label = DocItemLabel.CODE
    if not snippet:
        # Whitespace-only <pre> blocks carry no content worth recording.
        return

    current_parent = self.parents[self.level]
    doc.add_text(parent=current_parent, label=code_label, text=snippet)

def handle_paragraph(self, element, idx, doc):
"""Handles paragraph tags (p)."""
if element.text is None:
Expand Down
39 changes: 17 additions & 22 deletions docling/backend/mspowerpoint_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -358,41 +358,36 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument:

size = Size(width=slide_width, height=slide_height)
parent_page = doc.add_page(page_no=slide_ind + 1, size=size)
# parent_page = doc.add_page(page_no=slide_ind, size=size, hash=hash)

# Loop through each shape in the slide
for shape in slide.shapes:

def handle_shapes(shape, parent_slide, slide_ind, doc):
handle_groups(shape, parent_slide, slide_ind, doc)
if shape.has_table:
# Handle Tables
self.handle_tables(shape, parent_slide, slide_ind, doc)

if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
# Handle Tables
# Handle Pictures
self.handle_pictures(shape, parent_slide, slide_ind, doc)

# If shape doesn't have any text, move on to the next shape
if not hasattr(shape, "text"):
continue
return
if shape.text is None:
continue
return
if len(shape.text.strip()) == 0:
continue
return
if not shape.has_text_frame:
_log.warn("Warning: shape has text but not text_frame")
continue

# if shape.is_placeholder:
# Handle Titles (Headers) and Subtitles
# Check if the shape is a placeholder (titles are placeholders)
# self.handle_title(shape, parent_slide, slide_ind, doc)
# self.handle_text_elements(shape, parent_slide, slide_ind, doc)
# else:

_log.warning("Warning: shape has text but not text_frame")
return
# Handle other text elements, including lists (bullet lists, numbered lists)
self.handle_text_elements(shape, parent_slide, slide_ind, doc)
return

def handle_groups(shape, parent_slide, slide_ind, doc):
    """Run handle_shapes on every member of a group shape.

    Shapes whose type is not GROUP are left alone.
    """
    if shape.shape_type != MSO_SHAPE_TYPE.GROUP:
        return
    for member in shape.shapes:
        # Members are passed back to handle_shapes, which calls this
        # helper again, so nested groups are walked recursively.
        handle_shapes(member, parent_slide, slide_ind, doc)

# figures...
# doc.add_figure(data=BaseFigureData(), parent=self.parents[self.level], caption=None)
# Loop through each shape in the slide
for shape in slide.shapes:
handle_shapes(shape, parent_slide, slide_ind, doc)

return doc
3 changes: 3 additions & 0 deletions docs/concepts/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
In this area you can find guides on the main Docling concepts.

Use the navigation on the left to browse through them.
3 changes: 3 additions & 0 deletions docs/examples/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
In this area you can find examples covering a range of possible workflows and use cases.

Use the navigation on the left to browse through them.
3 changes: 3 additions & 0 deletions docs/integrations/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
In this area you can find guides on the Docling integrations with popular frameworks and tools.

Use the navigation on the left to browse through them.
5 changes: 4 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ theme:
- content.code.copy
- announce.dismiss
- navigation.tabs
# - navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.indexes # <= if set, each "section" can have its own page, if index.md is used
- navigation.instant
- navigation.instant.prefetch
# - navigation.instant.preview
Expand All @@ -57,9 +57,11 @@ nav:
- Usage: usage.md
- Docling v2: v2.md
- Concepts:
- Concepts: concepts/index.md
- Docling Document: concepts/docling_document.md
# - Chunking: concepts/chunking.md
- Examples:
- Examples: examples/index.md
- Conversion:
- "Simple conversion": examples/minimal.py
- "Custom conversion": examples/custom_convert.py
Expand All @@ -78,6 +80,7 @@ nav:
# - CLI:
# - CLI: examples/cli.md
- Integrations:
- Integrations: integrations/index.md
- "LlamaIndex 🦙 extension": integrations/llamaindex.md
# - "LangChain 🦜🔗 extension": integrations/langchain.md
# - API reference:
Expand Down

0 comments on commit 088ce5f

Please sign in to comment.