diff --git a/docling/backend/mspowerpoint_backend.py b/docling/backend/mspowerpoint_backend.py index aecebdc06..8b86008bd 100644 --- a/docling/backend/mspowerpoint_backend.py +++ b/docling/backend/mspowerpoint_backend.py @@ -98,21 +98,28 @@ def convert(self) -> DoclingDocument: return doc - def generate_prov(self, shape, slide_ind, text=""): - left = shape.left - top = shape.top - width = shape.width - height = shape.height + def generate_prov( + self, shape, slide_ind, text="", slide_size=Size(width=1, height=1) + ): + if shape.left: + left = shape.left + top = shape.top + width = shape.width + height = shape.height + else: + left = 0 + top = 0 + width = slide_size.width + height = slide_size.height shape_bbox = [left, top, left + width, top + height] shape_bbox = BoundingBox.from_tuple(shape_bbox, origin=CoordOrigin.BOTTOMLEFT) - # prov = [{"bbox": shape_bbox, "page": parent_slide, "span": [0, len(text)]}] prov = ProvenanceItem( page_no=slide_ind + 1, charspan=[0, len(text)], bbox=shape_bbox ) return prov - def handle_text_elements(self, shape, parent_slide, slide_ind, doc): + def handle_text_elements(self, shape, parent_slide, slide_ind, doc, slide_size): is_a_list = False is_list_group_created = False enum_list_item_value = 0 @@ -121,7 +128,7 @@ def handle_text_elements(self, shape, parent_slide, slide_ind, doc): list_text = "" list_label = GroupLabel.LIST doc_label = DocItemLabel.LIST_ITEM - prov = self.generate_prov(shape, slide_ind, shape.text.strip()) + prov = self.generate_prov(shape, slide_ind, shape.text.strip(), slide_size) # Identify if shape contains lists for paragraph in shape.text_frame.paragraphs: @@ -270,7 +277,7 @@ def handle_title(self, shape, parent_slide, slide_ind, doc): ) return - def handle_pictures(self, shape, parent_slide, slide_ind, doc): + def handle_pictures(self, shape, parent_slide, slide_ind, doc, slide_size): # Open it with PIL try: # Get the image bytes @@ -280,7 +287,7 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc): pil_image = Image.open(BytesIO(image_bytes)) # shape has picture - prov = self.generate_prov(shape, slide_ind, "") + prov = self.generate_prov(shape, slide_ind, "", slide_size) doc.add_picture( parent=parent_slide, image=ImageRef.from_pil(image=pil_image, dpi=im_dpi), @@ -291,13 +298,13 @@ def handle_pictures(self, shape, parent_slide, slide_ind, doc): _log.warning(f"Warning: image cannot be loaded by Pillow: {e}") return - def handle_tables(self, shape, parent_slide, slide_ind, doc): + def handle_tables(self, shape, parent_slide, slide_ind, doc, slide_size): # Handling tables, images, charts if shape.has_table: table = shape.table table_xml = shape._element - prov = self.generate_prov(shape, slide_ind, "") + prov = self.generate_prov(shape, slide_ind, "", slide_size) num_cols = 0 num_rows = len(table.rows) @@ -374,17 +381,19 @@ def walk_linear(self, pptx_obj, doc) -> DoclingDocument: name=f"slide-{slide_ind}", label=GroupLabel.CHAPTER, parent=parents[0] ) - size = Size(width=slide_width, height=slide_height) - parent_page = doc.add_page(page_no=slide_ind + 1, size=size) + slide_size = Size(width=slide_width, height=slide_height) + parent_page = doc.add_page(page_no=slide_ind + 1, size=slide_size) - def handle_shapes(shape, parent_slide, slide_ind, doc): - handle_groups(shape, parent_slide, slide_ind, doc) + def handle_shapes(shape, parent_slide, slide_ind, doc, slide_size): + handle_groups(shape, parent_slide, slide_ind, doc, slide_size) if shape.has_table: # Handle Tables - self.handle_tables(shape, parent_slide, slide_ind, doc) + self.handle_tables(shape, parent_slide, slide_ind, doc, slide_size) if shape.shape_type == MSO_SHAPE_TYPE.PICTURE: # Handle Pictures - self.handle_pictures(shape, parent_slide, slide_ind, doc) + self.handle_pictures( + shape, parent_slide, slide_ind, doc, slide_size + ) # If shape doesn't have any text, move on to the next shape if not hasattr(shape, "text"): return @@ -396,16 +405,20 @@ def handle_shapes(shape, parent_slide, slide_ind, doc): _log.warning("Warning: shape has text but not text_frame") return # Handle other text elements, including lists (bullet lists, numbered lists) - self.handle_text_elements(shape, parent_slide, slide_ind, doc) + self.handle_text_elements( + shape, parent_slide, slide_ind, doc, slide_size + ) return - def handle_groups(shape, parent_slide, slide_ind, doc): + def handle_groups(shape, parent_slide, slide_ind, doc, slide_size): if shape.shape_type == MSO_SHAPE_TYPE.GROUP: for groupedshape in shape.shapes: - handle_shapes(groupedshape, parent_slide, slide_ind, doc) + handle_shapes( + groupedshape, parent_slide, slide_ind, doc, slide_size + ) # Loop through each shape in the slide for shape in slide.shapes: - handle_shapes(shape, parent_slide, slide_ind, doc) + handle_shapes(shape, parent_slide, slide_ind, doc, slide_size) return doc