diff --git a/magic_pdf/libs/draw_bbox.py b/magic_pdf/libs/draw_bbox.py
index 65b78152..20e54342 100644
--- a/magic_pdf/libs/draw_bbox.py
+++ b/magic_pdf/libs/draw_bbox.py
@@ -1,6 +1,7 @@
 from magic_pdf.libs.Constants import CROSS_PAGE
 from magic_pdf.libs.commons import fitz  # PyMuPDF
-from magic_pdf.libs.ocr_content_type import ContentType, BlockType
+from magic_pdf.libs.ocr_content_type import ContentType, BlockType, CategoryId
+from magic_pdf.model.magic_model import MagicModel
 
 
 def draw_bbox_without_number(i, bbox_list, page, rgb_config, fill_config):
@@ -225,3 +226,50 @@ def get_span_info(span):
 
     # Save the PDF
     pdf_docs.save(f"{out_path}/spans.pdf")
+
+
+def drow_model_bbox(model_list: list, pdf_bytes, out_path):
+    """Draw the layout-model detections of every page and save model.pdf.
+
+    NOTE: "drow" is a misspelling of "draw"; the name is kept because
+    magic_pdf/tools/common.py imports the function under this name.
+
+    Args:
+        model_list: Model output with one entry per page; it is wrapped in a
+            MagicModel and each page's "layout_dets" are read back from it.
+        pdf_bytes: Bytes of the PDF to annotate.
+        out_path: Directory in which model.pdf is saved.
+    """
+    # (category id, RGB config) pairs, listed in the order the boxes are drawn.
+    draw_plan = [
+        (CategoryId.Abandon, [158, 158, 158]),
+        (CategoryId.TableBody, [204, 204, 0]),
+        (CategoryId.TableCaption, [255, 255, 102]),
+        (CategoryId.TableFootnote, [229, 255, 204]),
+        (CategoryId.ImageBody, [153, 255, 51]),
+        (CategoryId.ImageCaption, [102, 178, 255]),
+        (CategoryId.Title, [102, 102, 255]),
+        (CategoryId.Text, [153, 0, 76]),
+        (CategoryId.InterlineEquation_YOLO, [0, 255, 0]),
+    ]
+    pdf_docs = fitz.open("pdf", pdf_bytes)
+    magic_model = MagicModel(model_list, pdf_docs)
+
+    # bboxes_by_category[category_id][page_no] holds that page's boxes.
+    bboxes_by_category = {category_id: [] for category_id, _ in draw_plan}
+    for page_no in range(len(model_list)):
+        for per_page in bboxes_by_category.values():
+            per_page.append([])
+        for layout_det in magic_model.get_model_list(page_no)["layout_dets"]:
+            bbox = layout_det["bbox"]
+            per_page = bboxes_by_category.get(layout_det["category_id"])
+            # Detections of categories that are not drawn are skipped.
+            if per_page is not None:
+                per_page[page_no].append(bbox)
+
+    for page_no, page in enumerate(pdf_docs):
+        for category_id, rgb in draw_plan:
+            draw_bbox_with_number(page_no, bboxes_by_category[category_id], page, rgb, True)
+
+    # Save the PDF
+    pdf_docs.save(f"{out_path}/model.pdf")
diff --git a/magic_pdf/libs/ocr_content_type.py b/magic_pdf/libs/ocr_content_type.py
index 1886c82a..bb7e7752 100644
--- a/magic_pdf/libs/ocr_content_type.py
+++ b/magic_pdf/libs/ocr_content_type.py
@@ -19,3 +19,19 @@ class BlockType:
     Footnote = "footnote"
     Discarded = "discarded"
 
+
+class CategoryId:
+    """Integer ids compared against a layout detection's "category_id"."""
+
+    Title = 0
+    Text = 1
+    Abandon = 2
+    ImageBody = 3
+    ImageCaption = 4
+    TableBody = 5
+    TableCaption = 6
+    TableFootnote = 7
+    InterlineEquation_Layout = 8
+    InlineEquation = 13
+    InterlineEquation_YOLO = 14
+    OcrText = 15
diff --git a/magic_pdf/tools/cli_dev.py b/magic_pdf/tools/cli_dev.py
index e226cff0..14961bc0 100644
--- a/magic_pdf/tools/cli_dev.py
+++ b/magic_pdf/tools/cli_dev.py
@@ -94,6 +94,7 @@ def jsonl(jsonl, method, output_dir):
         jso["doc_layout_result"],
         method,
         f_dump_content_list=True,
+        f_draw_model_bbox=True,
     )
 
 
@@ -146,6 +147,7 @@ def read_fn(path):
         model_json_list,
         method,
         f_dump_content_list=True,
+        f_draw_model_bbox=True,
     )
 
 
diff --git a/magic_pdf/tools/common.py b/magic_pdf/tools/common.py
index ac27ef21..6ddea698 100644
--- a/magic_pdf/tools/common.py
+++ b/magic_pdf/tools/common.py
@@ -4,7 +4,7 @@
 import click
 from loguru import logger
 from magic_pdf.libs.MakeContentConfig import DropMode, MakeMode
-from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox
+from magic_pdf.libs.draw_bbox import draw_layout_bbox, draw_span_bbox, drow_model_bbox
 from magic_pdf.pipe.UNIPipe import UNIPipe
 from magic_pdf.pipe.OCRPipe import OCRPipe
 from magic_pdf.pipe.TXTPipe import TXTPipe
@@ -37,6 +37,7 @@ def do_parse(
     f_dump_orig_pdf=True,
     f_dump_content_list=False,
     f_make_md_mode=MakeMode.MM_MD,
+    f_draw_model_bbox=False,
 ):
     orig_model_list = copy.deepcopy(model_list)
     local_image_dir, local_md_dir = prepare_env(output_dir, pdf_file_name, parse_method)
@@ -73,6 +74,8 @@ def do_parse(
         draw_layout_bbox(pdf_info, pdf_bytes, local_md_dir)
     if f_draw_span_bbox:
         draw_span_bbox(pdf_info, pdf_bytes, local_md_dir)
+    if f_draw_model_bbox:
+        drow_model_bbox(orig_model_list, pdf_bytes, local_md_dir)
 
     md_content = pipe.pipe_mk_markdown(
         image_dir, drop_mode=DropMode.NONE, md_make_mode=f_make_md_mode