Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 26 additions & 2 deletions ms_agent/tools/docling/doc_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,18 @@
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.document_converter import (DocumentConverter, PdfFormatOption,
PowerpointFormatOption,
WordFormatOption)
from docling.models.document_picture_classifier import \
DocumentPictureClassifier
from docling.models.layout_model import LayoutModel
from docling.models.table_structure_model import TableStructureModel
from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItem
from ms_agent.tools.docling.doc_postprocess import PostProcess
from ms_agent.tools.docling.enrich_pipeline import (DocPipelineOptions,
EnrichDocPipeline)
from ms_agent.tools.docling.patches import (download_models_ms,
download_models_pic_classifier_ms,
html_handle_figure,
Expand Down Expand Up @@ -55,10 +59,30 @@ def __init__(self, verbose: bool = False):
pdf_pipeline_options.images_scale = 2.0
pdf_pipeline_options.accelerator_options = accelerator_options # type: ignore

word_pipeline_options = DocPipelineOptions()
word_pipeline_options.do_picture_classification = True
word_pipeline_options.do_code_enrichment = False
word_pipeline_options.do_formula_enrichment = False
word_pipeline_options.accelerator_options = accelerator_options # type: ignore

ppt_pipeline_options = DocPipelineOptions()
ppt_pipeline_options.do_picture_classification = True
ppt_pipeline_options.do_code_enrichment = False
ppt_pipeline_options.do_formula_enrichment = False
ppt_pipeline_options.accelerator_options = accelerator_options # type: ignore

self._converter = DocumentConverter(
format_options={
InputFormat.PDF:
PdfFormatOption(pipeline_options=pdf_pipeline_options)
PdfFormatOption(pipeline_options=pdf_pipeline_options),
InputFormat.DOCX:
WordFormatOption(
pipeline_cls=EnrichDocPipeline,
pipeline_options=word_pipeline_options),
InputFormat.PPTX:
PowerpointFormatOption(
pipeline_cls=EnrichDocPipeline,
pipeline_options=ppt_pipeline_options)
})

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The initialization of word_pipeline_options and ppt_pipeline_options is duplicated. You can create a single doc_pipeline_options object and reuse it for both DOCX and PPTX formats to improve maintainability and reduce code duplication.

        doc_pipeline_options = DocPipelineOptions()
        doc_pipeline_options.do_picture_classification = True
        doc_pipeline_options.do_code_enrichment = False
        doc_pipeline_options.do_formula_enrichment = False
        doc_pipeline_options.accelerator_options = accelerator_options  # type: ignore

        self._converter = DocumentConverter(
            format_options={
                InputFormat.PDF:
                PdfFormatOption(pipeline_options=pdf_pipeline_options),
                InputFormat.DOCX:
                WordFormatOption(
                    pipeline_cls=EnrichDocPipeline,
                    pipeline_options=doc_pipeline_options),
                InputFormat.PPTX:
                PowerpointFormatOption(
                    pipeline_cls=EnrichDocPipeline,
                    pipeline_options=doc_pipeline_options)
            })

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.


@staticmethod
Expand Down
10 changes: 10 additions & 0 deletions ms_agent/tools/docling/doc_postprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,14 @@ def filter(doc: DoclingDocument) -> Union[DoclingDocument, None]:
if pic_item.image.size.height * pic_item.image.size.width < PostProcess.MIN_PICTURE_SIZE:
pic_item.image = None # Remove image if too small

if hasattr(pic_item, 'annotations') and pic_item.annotations:
pic_classes = getattr(pic_item.annotations[0],
'predicted_classes', None)
pic_class = pic_classes[
0].class_name if pic_classes else None # Get the first predicted class if available
if pic_class is not None and pic_class.lower() in [
'qr_code', 'logo', 'icon'
]:

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Using a set for membership testing is more efficient than a list. Also, these magic strings should be defined as a constant at the class level for better maintainability, e.g., FILTERED_IMAGE_LABELS = {'qr_code', 'logo', 'icon'}.

                if pic_class is not None and pic_class.lower() in {'qr_code', 'logo', 'icon'}:

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

pic_item.image = None

return doc
132 changes: 132 additions & 0 deletions ms_agent/tools/docling/enrich_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
# yapf: disable
from pathlib import Path
from typing import Iterable, List, Optional, Union

import numpy as np
from docling.datamodel.accelerator_options import AcceleratorOptions
from docling.datamodel.pipeline_options import PipelineOptions
from docling.datamodel.settings import settings
from docling.models.code_formula_model import (CodeFormulaModel,
CodeFormulaModelOptions)
from docling.models.document_picture_classifier import (
DocumentPictureClassifier, DocumentPictureClassifierOptions)
from docling.pipeline.simple_pipeline import SimplePipeline
from docling_core.types.doc import (DoclingDocument, NodeItem,
PictureClassificationClass,
PictureClassificationData, PictureItem)
from PIL import Image


class DocPipelineOptions(PipelineOptions):
    """Pipeline settings used when converting Word and PowerPoint documents.

    Every enrichment switch defaults to ``False``; callers opt in to the
    steps they want by setting the corresponding flag to ``True``.
    """

    # Optional folder holding the model files; ``None`` means "not configured
    # on these options".
    artifacts_path: Optional[Union[Path, str]] = None

    # When True, pictures found in the document are classified.
    do_picture_classification: bool = False

    # When True, code OCR is performed.
    do_code_enrichment: bool = False

    # When True, formula OCR is performed and LaTeX code is returned.
    do_formula_enrichment: bool = False


class EnrichDocumentPictureClassifier(DocumentPictureClassifier):
    """
    Picture classifier that tolerates elements without a retrievable image.

    Intended for formats such as Word and PPT, where a picture element may
    have no usable image. Elements whose ``get_image()`` returns ``None`` are
    left unannotated but are still passed downstream, so one such element
    does not prevent the rest of the batch from being classified.
    """

    def __init__(self, enabled: bool, artifacts_path: Optional[Path],
                 options: DocumentPictureClassifierOptions,
                 accelerator_options: AcceleratorOptions):
        # Pure pass-through: construction is handled entirely by the base class.
        super().__init__(enabled, artifacts_path, options, accelerator_options)

    def __call__(
        self,
        doc: DoclingDocument,
        element_batch: Iterable[NodeItem],
    ) -> Iterable[NodeItem]:
        """
        Classify the pictures in ``element_batch`` and yield every element.

        Args:
            doc: Document the elements belong to; used to fetch each image.
            element_batch: Elements to process; each must be a ``PictureItem``.

        Yields:
            Every element of the batch, in its original order. Elements whose
            image could be retrieved gain a ``PictureClassificationData``
            annotation; the others are yielded unchanged.
        """
        if not self.enabled:
            # Disabled classifier: forward the batch untouched.
            yield from element_batch
            return

        # Materialise the batch once: it is walked here and again when yielding.
        batch: List[PictureItem] = list(element_batch)
        pictures: List[Union[Image.Image, np.ndarray]] = []
        classifiable: List[PictureItem] = []
        for el in batch:
            assert isinstance(
                el, PictureItem), f'Element {el} is not a PictureItem'
            picture = el.get_image(doc)
            if picture is None:
                # No image available: nothing to classify for this element.
                continue
            pictures.append(picture)
            classifiable.append(el)

        if pictures:
            predictions = self.document_picture_classifier.predict(pictures)
            # `classifiable` and `pictures` were filled in lockstep, so each
            # prediction lines up with the element its image came from.
            for item, scored in zip(classifiable, predictions):
                predicted = [
                    PictureClassificationClass(
                        class_name=pred[0],
                        confidence=pred[1],
                    ) for pred in scored
                ]
                annotation = PictureClassificationData(
                    provenance='DocumentPictureClassifier',
                    predicted_classes=predicted,
                )
                item.annotations.append(annotation)

        # Emit the full batch, including elements that were skipped above.
        yield from batch


class EnrichDocPipeline(SimplePipeline):
    """Simple pipeline for Word and PPT documents with enrichment models attached."""

    def __init__(self, pipeline_options: DocPipelineOptions):
        """
        Build the pipeline and register its enrichment models.

        Args:
            pipeline_options: Options selecting which enrichment steps are
                enabled and, optionally, where the model artifacts live.

        Raises:
            RuntimeError: If an artifacts path is configured (on the options
                or in the global settings) but is not an existing directory.
        """
        super().__init__(pipeline_options)

        # The per-pipeline setting wins; the global setting is the fallback.
        configured_path = pipeline_options.artifacts_path
        if configured_path is None:
            configured_path = settings.artifacts_path

        artifacts_path: Optional[Path] = None
        if configured_path is not None:
            artifacts_path = Path(configured_path).expanduser()

        if artifacts_path is not None and not artifacts_path.is_dir():
            raise RuntimeError(
                f'The value of {artifacts_path=} is not valid. '
                'When defined, it must point to a folder containing all models required by the pipeline.'
            )

        # Code Formula Enrichment Model: enabled when either sub-feature is requested.
        code_formula_model = CodeFormulaModel(
            enabled=pipeline_options.do_code_enrichment
            or pipeline_options.do_formula_enrichment,
            artifacts_path=artifacts_path,
            options=CodeFormulaModelOptions(
                do_code_enrichment=pipeline_options.do_code_enrichment,
                do_formula_enrichment=pipeline_options.do_formula_enrichment,
            ),
            accelerator_options=pipeline_options.accelerator_options,
        )

        # Document Picture Classifier: the variant that skips elements without an image.
        picture_classifier = EnrichDocumentPictureClassifier(
            enabled=pipeline_options.do_picture_classification,
            artifacts_path=artifacts_path,
            options=DocumentPictureClassifierOptions(),
            accelerator_options=pipeline_options.accelerator_options,
        )

        self.enrichment_pipe = [code_formula_model, picture_classifier]

    @classmethod
    def get_default_options(cls) -> DocPipelineOptions:
        """Return a ``DocPipelineOptions`` instance with all defaults."""
        return DocPipelineOptions()
Loading