From c90ee891d632efd0b6d865ea243c900a4a10a67b Mon Sep 17 00:00:00 2001 From: myhloli Date: Fri, 9 Aug 2024 20:09:53 +0800 Subject: [PATCH] feat(draw_bbox): add model bbox drawing functionality Implement the feature to draw bounding boxes for model elements in the PDF. This includes adding new drawing functions and modifying existing ones to accommodate the new feature. Also, updates are made to CLI tools and common utilities to support the model bbox drawing. --- magic_pdf/libs/draw_bbox.py | 67 +++++++++++++++++++++++++++++- magic_pdf/libs/ocr_content_type.py | 14 +++++++ magic_pdf/tools/cli_dev.py | 2 + magic_pdf/tools/common.py | 5 ++- 4 files changed, 86 insertions(+), 2 deletions(-) diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py index 65b78152..20e54342 100644 --- a/magic_pdf/libs/draw_bbox.py +++ b/magic_pdf/libs/draw_bbox.py @@ -1,6 +1,7 @@ from magic_pdf.libs.Constants import CROSS_PAGE from magic_pdf.libs.commons import fitz # PyMuPDF -from magic_pdf.libs.ocr_content_type import ContentType, BlockType +from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId +from magic_pdf.model.magic_model import MagicModel def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config): @@ -225,3 +226,67 @@ def get_span_info(span): # Save the PDF pdf_docs.save(f"{out_path}/spans.pdf") + + +def drow_model_bbox(model_list: list, pdf_bytes, out_path): + dropped_bbox_list = [] + tables_body_list, tables_caption_list, tables_footnote_list = [], [], [] + imgs_body_list, imgs_caption_list = [], [] + titles_list = [] + texts_list = [] + interequations_list = [] + pdf_docs = fitz.open("pdf", pdf_bytes) + magic_model = MagicModel(model_list, pdf_docs) + for i in range(len(model_list)): + page_dropped_list = [] + tables_body, tables_caption, tables_footnote = [], [], [] + imgs_body, imgs_caption = [], [] + titles = [] + texts = [] + interequations = [] + page_info = magic_model.get_model_list(i) + layout_dets = page_info["layout_dets"] + for layout_det in layout_dets: + bbox = layout_det["bbox"] + if layout_det["category_id"] == CategoryId.Text: + texts.append(bbox) + elif layout_det["category_id"] == CategoryId.Title: + titles.append(bbox) + elif layout_det["category_id"] == CategoryId.TableBody: + tables_body.append(bbox) + elif layout_det["category_id"] == CategoryId.TableCaption: + tables_caption.append(bbox) + elif layout_det["category_id"] == CategoryId.TableFootnote: + tables_footnote.append(bbox) + elif layout_det["category_id"] == CategoryId.ImageBody: + imgs_body.append(bbox) + elif layout_det["category_id"] == CategoryId.ImageCaption: + imgs_caption.append(bbox) + elif layout_det["category_id"] == CategoryId.InterlineEquation_YOLO: + interequations.append(bbox) + elif layout_det["category_id"] == CategoryId.Abandon: + page_dropped_list.append(bbox) + + tables_body_list.append(tables_body) + tables_caption_list.append(tables_caption) + tables_footnote_list.append(tables_footnote) + imgs_body_list.append(imgs_body) + imgs_caption_list.append(imgs_caption) + titles_list.append(titles) + texts_list.append(texts) + interequations_list.append(interequations) + dropped_bbox_list.append(page_dropped_list) + + for i, page in enumerate(pdf_docs): + draw_bbox_with_number(i, dropped_bbox_list, page, [158, 158, 158], True) # color ! + draw_bbox_with_number(i, tables_body_list, page, [204, 204, 0], True) + draw_bbox_with_number(i, tables_caption_list, page, [255, 255, 102], True) + draw_bbox_with_number(i, tables_footnote_list, page, [229, 255, 204], True) + draw_bbox_with_number(i, imgs_body_list, page, [153, 255, 51], True) + draw_bbox_with_number(i, imgs_caption_list, page, [102, 178, 255], True) + draw_bbox_with_number(i, titles_list, page, [102, 102, 255], True) + draw_bbox_with_number(i, texts_list, page, [153, 0, 76], True) + draw_bbox_with_number(i, interequations_list, page, [0, 255, 0], True) + + # Save the PDF + pdf_docs.save(f"{out_path}/model.pdf") \ No newline at end of file diff --git a/magic_pdf/libs/ocr_content_type.py b/magic_pdf/libs/ocr_content_type.py index 1886c82a..bb7e7752 100644 --- a/magic_pdf/libs/ocr_content_type.py +++ b/magic_pdf/libs/ocr_content_type.py @@ -19,3 +19,17 @@ class BlockType: Footnote = "footnote" Discarded = "discarded" + +class CategoryId: + Title = 0 + Text = 1 + Abandon = 2 + ImageBody = 3 + ImageCaption = 4 + TableBody = 5 + TableCaption = 6 + TableFootnote = 7 + InterlineEquation_Layout = 8 + InlineEquation = 13 + InterlineEquation_YOLO = 14 + OcrText = 15 diff --git a/magic_pdf/tools/cli_dev.py b/magic_pdf/tools/cli_dev.py index e226cff0..14961bc0 100644 --- a/magic_pdf/tools/cli_dev.py +++ b/magic_pdf/tools/cli_dev.py @@ -94,6 +94,7 @@ def jsonl(jsonl, method, output_dir): jso["doc_layout_result"], method, f_dump_content_list=True, + f_draw_model_bbox=True, ) @@ -146,6 +147,7 @@ def read_fn(path): model_json_list, method, f_dump_content_list=True, + f_draw_model_bbox=True, ) diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py index ac27ef21..6ddea698 100644 --- a/magic_pdf/tools/common.py +++ b/magic_pdf/tools/common.py @@ -4,7 +4,7 @@ import click from loguru import logger from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode -from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox +from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox from magic_pdf.pipe.UNIPipe import UNIPipe from magic_pdf.pipe.OCRPipe import OCRPipe from magic_pdf.pipe.TXTPipe import TXTPipe @@ -37,6 +37,7 @@ def do_parse( f_dump_orig_pdf=True, f_dump_content_list=False, f_make_md_mode=MakeMode.MM_MD, + f_draw_model_bbox=False, ): orig_model_list = copy.deepcopy(model_list) local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method) @@ -73,6 +74,8 @@ def do_parse( draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir) if f_draw_span_bbox: draw_span_bbox(pdf_info, pdf_bytes, local_md_dir) + if f_draw_model_bbox: + drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir) md_content = pipe.pipe_mk_markdown( image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode