From 097fc7002f340d5b2a13d661681290e2f6204a1c Mon Sep 17 00:00:00 2001 From: alcholiclg Date: Thu, 14 Aug 2025 17:44:55 +0800 Subject: [PATCH 1/3] support image classification in docx and pptx --- ms_agent/tools/docling/doc_loader.py | 28 ++++- ms_agent/tools/docling/enrich_pipeline.py | 132 ++++++++++++++++++++++ 2 files changed, 158 insertions(+), 2 deletions(-) create mode 100644 ms_agent/tools/docling/enrich_pipeline.py diff --git a/ms_agent/tools/docling/doc_loader.py b/ms_agent/tools/docling/doc_loader.py index a19a03df5..2d7833752 100644 --- a/ms_agent/tools/docling/doc_loader.py +++ b/ms_agent/tools/docling/doc_loader.py @@ -7,7 +7,9 @@ from docling.datamodel.base_models import InputFormat from docling.datamodel.document import ConversionResult from docling.datamodel.pipeline_options import PdfPipelineOptions -from docling.document_converter import DocumentConverter, PdfFormatOption +from docling.document_converter import (DocumentConverter, PdfFormatOption, + PowerpointFormatOption, + WordFormatOption) from docling.models.document_picture_classifier import \ DocumentPictureClassifier from docling.models.layout_model import LayoutModel @@ -15,6 +17,8 @@ from docling_core.types import DoclingDocument from docling_core.types.doc import DocItem from ms_agent.tools.docling.doc_postprocess import PostProcess +from ms_agent.tools.docling.enrich_pipeline import (DocPipelineOptions, + EnrichDocPipeline) from ms_agent.tools.docling.patches import (download_models_ms, download_models_pic_classifier_ms, html_handle_figure, @@ -55,10 +59,30 @@ def __init__(self, verbose: bool = False): pdf_pipeline_options.images_scale = 2.0 pdf_pipeline_options.accelerator_options = accelerator_options # type: ignore + word_pipeline_options = DocPipelineOptions() + word_pipeline_options.do_picture_classification = True + word_pipeline_options.do_code_enrichment = False + word_pipeline_options.do_formula_enrichment = False + word_pipeline_options.accelerator_options = accelerator_options # type: ignore + + ppt_pipeline_options = DocPipelineOptions() + ppt_pipeline_options.do_picture_classification = True + ppt_pipeline_options.do_code_enrichment = False + ppt_pipeline_options.do_formula_enrichment = False + ppt_pipeline_options.accelerator_options = accelerator_options # type: ignore + self._converter = DocumentConverter( format_options={ InputFormat.PDF: - PdfFormatOption(pipeline_options=pdf_pipeline_options) + PdfFormatOption(pipeline_options=pdf_pipeline_options), + InputFormat.DOCX: + WordFormatOption( + pipeline_cls=EnrichDocPipeline, + pipeline_options=word_pipeline_options), + InputFormat.PPTX: + PowerpointFormatOption( + pipeline_cls=EnrichDocPipeline, + pipeline_options=ppt_pipeline_options) }) @staticmethod diff --git a/ms_agent/tools/docling/enrich_pipeline.py b/ms_agent/tools/docling/enrich_pipeline.py new file mode 100644 index 000000000..b2d93abd6 --- /dev/null +++ b/ms_agent/tools/docling/enrich_pipeline.py @@ -0,0 +1,132 @@ +# yapf: disable +from pathlib import Path +from typing import Iterable, List, Optional, Union + +import numpy as np +from docling.datamodel.accelerator_options import AcceleratorOptions +from docling.datamodel.pipeline_options import PipelineOptions +from docling.datamodel.settings import settings +from docling.models.code_formula_model import (CodeFormulaModel, + CodeFormulaModelOptions) +from docling.models.document_picture_classifier import ( + DocumentPictureClassifier, DocumentPictureClassifierOptions) +from docling.pipeline.simple_pipeline import SimplePipeline +from docling_core.types.doc import (DoclingDocument, NodeItem, + PictureClassificationClass, + PictureClassificationData, PictureItem) +from PIL import Image + + +class DocPipelineOptions(PipelineOptions): + """Options for processing Word and PPT documents in the pipeline.""" + + artifacts_path: Optional[Union[Path, str]] = None + do_picture_classification: bool = False # True: classify pictures in documents + do_code_enrichment: bool = False # True: perform code OCR + do_formula_enrichment: bool = False # True: perform formula OCR, return Latex code + + +class EnrichDocumentPictureClassifier(DocumentPictureClassifier): + """ + Specializes DocumentPictureClassifier for robust pipeline processing. + + This classifier is designed to handle document formats like Word and PPT where + images might be missing or invalid. It overrides the default behavior to + skip elements with unreadable images instead of raising an error, thus + preventing the entire processing pipeline from halting. + """ + + def __init__(self, enabled: bool, artifacts_path: Optional[Path], + options: DocumentPictureClassifierOptions, + accelerator_options: AcceleratorOptions): + super().__init__(enabled, artifacts_path, options, accelerator_options) + + def __call__( + self, + doc: DoclingDocument, + element_batch: Iterable[NodeItem], + ) -> Iterable[NodeItem]: + """ + This method iterates through a batch of elements, extracts their images, + and applies the picture classification model. Unlike the base class + implementation, it gracefully handles cases where an image cannot be + retrieved (i.e., `get_image()` returns None) by skipping that element. + This ensures that a single faulty item does not stop the entire batch + processing. + """ + if not self.enabled: + for element in element_batch: + yield element + return + + images: List[Union[Image.Image, np.ndarray]] = [] + elements_with_images: List[PictureItem] = [] + element_batch_list: List[PictureItem] = list(element_batch) + for el in element_batch_list: + assert isinstance( + el, PictureItem), f'Element {el} is not a PictureItem' + img = el.get_image(doc) + if img is not None: + images.append(img) + elements_with_images.append(el) + + if images: + outputs = self.document_picture_classifier.predict(images) + for element, output in zip(elements_with_images, outputs): + element.annotations.append( + PictureClassificationData( + provenance='DocumentPictureClassifier', + predicted_classes=[ + PictureClassificationClass( + class_name=pred[0], + confidence=pred[1], + ) for pred in output + ], + )) + + for element in element_batch_list: + yield element + + +class EnrichDocPipeline(SimplePipeline): + """Pipeline for enriching Word and PPT documents with additional processing steps.""" + + def __init__(self, pipeline_options: DocPipelineOptions): + super().__init__(pipeline_options) + + artifacts_path: Optional[Path] = None + if pipeline_options.artifacts_path is not None: + artifacts_path = Path(pipeline_options.artifacts_path).expanduser() + elif settings.artifacts_path is not None: + artifacts_path = Path(settings.artifacts_path).expanduser() + + if artifacts_path is not None and not artifacts_path.is_dir(): + raise RuntimeError( + f'The value of {artifacts_path=} is not valid. ' + 'When defined, it must point to a folder containing all models required by the pipeline.' + ) + + self.enrichment_pipe = [ + # Code Formula Enrichment Model + CodeFormulaModel( + enabled=pipeline_options.do_code_enrichment + or pipeline_options.do_formula_enrichment, + artifacts_path=artifacts_path, + options=CodeFormulaModelOptions( + do_code_enrichment=pipeline_options.do_code_enrichment, + do_formula_enrichment=pipeline_options.do_formula_enrichment, + ), + accelerator_options=pipeline_options.accelerator_options, + ), + # Document Picture Classifier + EnrichDocumentPictureClassifier( + enabled=pipeline_options.do_picture_classification, + artifacts_path=artifacts_path, + options=DocumentPictureClassifierOptions(), + accelerator_options=pipeline_options.accelerator_options, + ) + ] + + @classmethod + def get_default_options(cls) -> DocPipelineOptions: + return DocPipelineOptions() From 53b6523ddc4e2cc94c6439ae1232b055e2f886ad Mon Sep 17 00:00:00 2001 From: alcholiclg Date: Thu, 14 Aug 2025 17:50:47 +0800 Subject: [PATCH 2/3] Support for filtering images using classification labels. --- ms_agent/tools/docling/doc_postprocess.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ms_agent/tools/docling/doc_postprocess.py b/ms_agent/tools/docling/doc_postprocess.py index 2e21051d9..e8b614210 100644 --- a/ms_agent/tools/docling/doc_postprocess.py +++ b/ms_agent/tools/docling/doc_postprocess.py @@ -22,4 +22,14 @@ def filter(doc: DoclingDocument) -> Union[DoclingDocument, None]: if pic_item.image.size.height * pic_item.image.size.width < PostProcess.MIN_PICTURE_SIZE: pic_item.image = None # Remove image if too small + if hasattr(pic_item, 'annotations') and pic_item.annotations: + pic_classes = getattr(pic_item.annotations[0], + 'predicted_classes', None) + pic_class = pic_classes[ + 0].class_name if pic_classes else None # Get the first predicted class if available + if pic_class is not None and pic_class.lower() in [ + 'qr_code', 'logo', 'icon' + ]: + pic_item.image = None + return doc From d46367aeb389d50d1a2bd5cc1dbcd014fd4dcd82 Mon Sep 17 00:00:00 2001 From: alcholiclg Date: Fri, 15 Aug 2025 00:35:08 +0800 Subject: [PATCH 3/3] improve code maintainability --- ms_agent/tools/docling/doc_loader.py | 20 +++++++------------- ms_agent/tools/docling/doc_postprocess.py | 5 ++--- 2 files changed, 9 insertions(+), 16 deletions(-) diff --git a/ms_agent/tools/docling/doc_loader.py b/ms_agent/tools/docling/doc_loader.py index 2d7833752..c4952cd0f 100644 --- a/ms_agent/tools/docling/doc_loader.py +++ b/ms_agent/tools/docling/doc_loader.py @@ -59,17 +59,11 @@ def __init__(self, verbose: bool = False): pdf_pipeline_options.images_scale = 2.0 pdf_pipeline_options.accelerator_options = accelerator_options # type: ignore - word_pipeline_options = DocPipelineOptions() - word_pipeline_options.do_picture_classification = True - word_pipeline_options.do_code_enrichment = False - word_pipeline_options.do_formula_enrichment = False - word_pipeline_options.accelerator_options = accelerator_options # type: ignore - - ppt_pipeline_options = DocPipelineOptions() - ppt_pipeline_options.do_picture_classification = True - ppt_pipeline_options.do_code_enrichment = False - ppt_pipeline_options.do_formula_enrichment = False - ppt_pipeline_options.accelerator_options = accelerator_options # type: ignore + doc_pipeline_options = DocPipelineOptions() + doc_pipeline_options.do_picture_classification = True + doc_pipeline_options.do_code_enrichment = False + doc_pipeline_options.do_formula_enrichment = False + doc_pipeline_options.accelerator_options = accelerator_options # type: ignore self._converter = DocumentConverter( format_options={ @@ -78,11 +72,11 @@ def __init__(self, verbose: bool = False): InputFormat.DOCX: WordFormatOption( pipeline_cls=EnrichDocPipeline, - pipeline_options=word_pipeline_options), + pipeline_options=doc_pipeline_options), InputFormat.PPTX: PowerpointFormatOption( pipeline_cls=EnrichDocPipeline, - pipeline_options=ppt_pipeline_options) + pipeline_options=doc_pipeline_options) }) @staticmethod diff --git a/ms_agent/tools/docling/doc_postprocess.py b/ms_agent/tools/docling/doc_postprocess.py index e8b614210..a0348d1b3 100644 --- a/ms_agent/tools/docling/doc_postprocess.py +++ b/ms_agent/tools/docling/doc_postprocess.py @@ -6,6 +6,7 @@ class PostProcess: MIN_PICTURE_SIZE = 200.0 * 200.0 # Minimum size for pictures in pixels + FILTERED_IMAGE_LABELS = {'qr_code', 'logo', 'icon'} def __init__(self): ... @@ -27,9 +28,7 @@ def filter(doc: DoclingDocument) -> Union[DoclingDocument, None]: 'predicted_classes', None) pic_class = pic_classes[ 0].class_name if pic_classes else None # Get the first predicted class if available - if pic_class is not None and pic_class.lower() in [ - 'qr_code', 'logo', 'icon' - ]: + if pic_class is not None and pic_class.lower() in PostProcess.FILTERED_IMAGE_LABELS: # yapf: disable pic_item.image = None return doc