From 09269c845eeef745f44d93acf567e89a8a2d1871 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E7=91=9E?= Date: Wed, 20 Mar 2024 21:52:50 +0800 Subject: [PATCH 1/2] feat: add extract_train_data --- magic_pdf/pdf_parse_for_train.py | 684 ++++++++++++++++++ magic_pdf/pipeline.py | 444 ++++++++---- magic_pdf/train_utils/__init__.py | 0 .../train_utils/convert_to_train_format.py | 52 ++ magic_pdf/train_utils/extract_caption.py | 59 ++ magic_pdf/train_utils/remove_footer_header.py | 159 ++++ magic_pdf/train_utils/vis_utils.py | 327 +++++++++ 7 files changed, 1591 insertions(+), 134 deletions(-) create mode 100644 magic_pdf/pdf_parse_for_train.py create mode 100644 magic_pdf/train_utils/__init__.py create mode 100644 magic_pdf/train_utils/convert_to_train_format.py create mode 100644 magic_pdf/train_utils/extract_caption.py create mode 100644 magic_pdf/train_utils/remove_footer_header.py create mode 100644 magic_pdf/train_utils/vis_utils.py diff --git a/magic_pdf/pdf_parse_for_train.py b/magic_pdf/pdf_parse_for_train.py new file mode 100644 index 00000000..8c254f80 --- /dev/null +++ b/magic_pdf/pdf_parse_for_train.py @@ -0,0 +1,684 @@ +import time + +# from anyio import Path + +from magic_pdf.libs.commons import ( + fitz, + get_delta_time, + get_img_s3_client, + get_docx_model_output, +) +import json +import os +from copy import deepcopy +import math +from loguru import logger +from magic_pdf.layout.bbox_sort import ( + prepare_bboxes_for_layout_split, +) +from magic_pdf.layout.layout_sort import ( + LAYOUT_UNPROC, + get_bboxes_layout, + get_columns_cnt_of_layout, + sort_text_block, +) +from magic_pdf.libs.drop_reason import DropReason +from magic_pdf.libs.markdown_utils import escape_special_markdown_char +from magic_pdf.libs.safe_filename import sanitize_filename +from magic_pdf.libs.vis_utils import draw_bbox_on_page, draw_layout_bbox_on_page +from magic_pdf.pre_proc.detect_images import parse_images +from magic_pdf.pre_proc.detect_tables import parse_tables # 获取tables的bbox +from magic_pdf.pre_proc.detect_equation import parse_equations # 获取equations的bbox +from magic_pdf.pre_proc.detect_header import parse_headers # 获取headers的bbox +from magic_pdf.pre_proc.detect_page_number import parse_pageNos # 获取pageNos的bbox +from magic_pdf.pre_proc.detect_footnote import ( + parse_footnotes_by_model, + parse_footnotes_by_rule, +) # 获取footnotes的bbox +from magic_pdf.pre_proc.detect_footer_by_model import parse_footers # 获取footers的bbox + +from magic_pdf.post_proc.detect_para import ( + ParaProcessPipeline, + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +from magic_pdf.pre_proc.main_text_font import get_main_text_font +from magic_pdf.pre_proc.remove_colored_strip_bbox import remove_colored_strip_textblock +from magic_pdf.pre_proc.remove_footer_header import remove_headder_footer_one_page +from magic_pdf.train_utils.extract_caption import extract_caption_bbox + +""" +from para.para_pipeline import ParaProcessPipeline +from para.exceptions import ( + TitleDetectionException, + TitleLevelException, + ParaSplitException, + ParaMergeException, + DenseSingleLineBlockException, +) +""" + +from magic_pdf.libs.commons import read_file, join_path +from magic_pdf.libs.pdf_image_tools import save_images_by_bboxes +from magic_pdf.post_proc.remove_footnote import ( + merge_footnote_blocks, + remove_footnote_blocks, +) +from magic_pdf.pre_proc.citationmarker_remove import remove_citation_marker +from magic_pdf.pre_proc.equations_replace 
import ( + combine_chars_to_pymudict, + remove_chars_in_text_blocks, + replace_equations_in_textblock, +) +from magic_pdf.pre_proc.pdf_pre_filter import pdf_filter +from magic_pdf.pre_proc.detect_footer_header_by_statistics import drop_footer_header +from magic_pdf.pre_proc.construct_paras import construct_page_component +from magic_pdf.pre_proc.fix_image import ( + combine_images, + fix_image_vertical, + fix_seperated_image, + include_img_title, +) +from magic_pdf.post_proc.pdf_post_filter import pdf_post_filter +from magic_pdf.pre_proc.remove_rotate_bbox import ( + get_side_boundry, + remove_rotate_side_textblock, + remove_side_blank_block, +) +from magic_pdf.pre_proc.resolve_bbox_conflict import ( + check_text_block_horizontal_overlap, + resolve_bbox_overlap_conflict, +) +from magic_pdf.pre_proc.fix_table import ( + fix_table_text_block, + fix_tables, + include_table_title, +) +from magic_pdf.pre_proc.solve_line_alien import solve_inline_too_large_interval + +denseSingleLineBlockException_msg = DenseSingleLineBlockException().message +titleDetectionException_msg = TitleDetectionException().message +titleLevelException_msg = TitleLevelException().message +paraSplitException_msg = ParaSplitException().message +paraMergeException_msg = ParaMergeException().message + + +def parse_pdf_for_train( + s3_pdf_path, + s3_pdf_profile, + pdf_model_output, + save_path, + book_name, + pdf_model_profile=None, + image_s3_config=None, + start_page_id=0, + end_page_id=None, + junk_img_bojids=[], + debug_mode=False, +): + pdf_bytes = read_file(s3_pdf_path, s3_pdf_profile) + save_tmp_path = os.path.join(os.path.dirname(__file__), "../..", "tmp", "unittest") + md_bookname_save_path = "" + book_name = sanitize_filename(book_name) + if debug_mode: + save_path = join_path(save_tmp_path, "md") + pdf_local_path = join_path(save_tmp_path, "download-pdfs", book_name) + + if not os.path.exists(os.path.dirname(pdf_local_path)): + # 如果目录不存在,创建它 + os.makedirs(os.path.dirname(pdf_local_path)) + + md_bookname_save_path = join_path(save_tmp_path, "md", book_name) + if not os.path.exists(md_bookname_save_path): + # 如果目录不存在,创建它 + os.makedirs(md_bookname_save_path) + + with open(pdf_local_path + ".pdf", "wb") as pdf_file: + pdf_file.write(pdf_bytes) + + pdf_docs = fitz.open("pdf", pdf_bytes) + pdf_info_dict = {} + img_s3_client = get_img_s3_client( + save_path, image_s3_config + ) # 更改函数名和参数,避免歧义 + # img_s3_client = "img_s3_client" #不创建这个对象,直接用字符串占位 + + start_time = time.time() + + """通过统计pdf全篇文字,识别正文字体""" + main_text_font = get_main_text_font(pdf_docs) + + end_page_id = end_page_id if end_page_id else len(pdf_docs) - 1 + for page_id in range(start_page_id, end_page_id + 1): + page = pdf_docs[page_id] + page_width = page.rect.width + page_height = page.rect.height + + if debug_mode: + time_now = time.time() + logger.info( + f"page_id: {page_id}, last_page_cost_time: {get_delta_time(start_time)}" + ) + start_time = time_now + """ + # 通过一个规则,过滤掉单页超过1500非junkimg的pdf + # 对单页面非重复id的img数量做统计,如果当前页超过1500则直接return need_drop + """ + page_imgs = page.get_images() + img_counts = 0 + for img in page_imgs: + img_bojid = img[0] + if img_bojid in junk_img_bojids: # 判断这个图片在不在junklist中 + continue # 如果在junklist就不用管了,跳过 + else: + recs = page.get_image_rects(img, transform=True) + if recs: # 如果这张图在当前页面有展示 + img_counts += 1 + if ( + img_counts >= 1500 + ): # 如果去除了junkimg的影响,单页img仍然超过1500的话,就排除当前pdf + logger.warning( + f"page_id: {page_id}, img_counts: {img_counts}, drop this pdf: {book_name}, drop_reason: 
{DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.HIGH_COMPUTATIONAL_lOAD_BY_IMGS, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 首先获取基本的block数据,对pdf进行分解,获取图片、表格、公式、text的bbox + """ + # 解析pdf原始文本block + text_raw_blocks = page.get_text( + "dict", + flags=fitz.TEXTFLAGS_TEXT, + )["blocks"] + model_output_json = get_docx_model_output( + pdf_model_output, pdf_model_profile, page_id + ) + + # 解析图片 + image_bboxes = parse_images(page_id, page, model_output_json, junk_img_bojids) + image_bboxes = fix_image_vertical( + image_bboxes, text_raw_blocks + ) # 修正图片的位置 + image_bboxes = fix_seperated_image(image_bboxes) # 合并有边重合的图片 + + old_image_bboxes = deepcopy(image_bboxes) + image_bboxes = include_img_title( + text_raw_blocks, image_bboxes + ) # 向图片上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + """此时image_bboxes中可能出现这种情况,水平并列的2个图片,下方分别有各自的子标题,2个子标题下方又有大标题(形如Figxxx),会出现2个图片的bbox都包含了这个大标题,这种情况需要把图片合并""" + image_bboxes = combine_images(image_bboxes) # 合并图片 + + # 解析表格并对table_bboxes进行位置的微调,防止表格周围的文字被截断 + table_bboxes = parse_tables(page_id, page, model_output_json) + table_bboxes = fix_tables( + page, table_bboxes, include_table_title=True, scan_line_num=2 + ) # 修正 + table_bboxes = fix_table_text_block( + text_raw_blocks, table_bboxes + ) # 修正与text block的关系,某些table修正与pymupdf获取到的table内textblock没有完全包含,因此要进行一次修正。 + # debug_show_bbox(pdf_docs, page_id, table_bboxes, [], [b['bbox'] for b in text_raw_blocks], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + old_table_bboxes = deepcopy(table_bboxes) + table_bboxes = include_table_title( + text_raw_blocks, table_bboxes + ) # 向table上方和下方寻找title,使用规则进行匹配,暂时只支持英文规则 + + # 解析公式 + equations_inline_bboxes, equations_interline_bboxes = parse_equations( + page_id, page, model_output_json + ) + + # get image box and caption ! + image_bboxes_with_caption = extract_caption_bbox(image_bboxes, old_image_bboxes) + + # get table box and caption ! 
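+        # extract_caption_bbox(outer, inner) pairs each original (pre-title) box
+        # in `inner` with the title-expanded box in `outer` that contains it; the
+        # largest of the four strips left over around the original box is kept
+        # as the caption region. Each returned entry is {"bbox": [...]} with an
+        # optional "caption" key (see magic_pdf/train_utils/extract_caption.py
+        # added in this patch).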
+ table_bboxes_with_caption = extract_caption_bbox(table_bboxes, old_table_bboxes) + + """ + ================================================================================================================================== + 进入预处理-1阶段 + ------------------- + # # 解析标题 + # title_bboxs = parse_titles(page_id, page, model_output_json) + # # 评估Layout是否规整、简单 + # isSimpleLayout_flag, fullColumn_cnt, subColumn_cnt, curPage_loss = evaluate_pdf_layout(page_id, page, model_output_json) + 接下来开始进行预处理过程 + """ + + """去掉每页的页码、页眉、页脚""" + page_no_bboxs = parse_pageNos(page_id, page, model_output_json) + header_bboxs = parse_headers(page_id, page, model_output_json) + footer_bboxs = parse_footers(page_id, page, model_output_json) + ( + image_bboxes, + table_bboxes, + remain_text_blocks, + removed_hdr_foot_txt_block, + removed_hdr_foot_img_block, + removed_hdr_foot_table, + ) = remove_headder_footer_one_page( + text_raw_blocks, + image_bboxes, + table_bboxes, + header_bboxs, + footer_bboxs, + page_no_bboxs, + page_width, + page_height, + ) + + """去除页面上半部分长条色块内的文本块""" + remain_text_blocks, removed_colored_narrow_strip_background_text_block = ( + remove_colored_strip_textblock(remain_text_blocks, page) + ) + + # debug_show_bbox(pdf_docs, page_id, footnote_bboxes_by_model, [b['bbox'] for b in remain_text_blocks], header_bboxs, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 7) + + """去掉旋转的文字:水印、垂直排列的文字""" + remain_text_blocks, removed_non_horz_text_block = remove_rotate_side_textblock( + remain_text_blocks, page_width, page_height + ) # 去掉水印,非水平文字 + remain_text_blocks, removed_empty_side_block = remove_side_blank_block( + remain_text_blocks, page_width, page_height + ) # 删除页面四周可能会留下的完全空白的textblock,这种block形成原因未知 + + """出现在图片、表格上的文字块去掉,把层叠的图片单独分离出来,不参与layout的计算""" + ( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + text_block_on_image_removed, + images_overlap_backup, + interline_eq_temp_text_block, + ) = resolve_bbox_overlap_conflict( + image_bboxes, + table_bboxes, + equations_interline_bboxes, + equations_inline_bboxes, + remain_text_blocks, + ) + + # """去掉footnote, 从文字和图片中""" + # # 通过模型识别到的footnote + # footnote_bboxes_by_model = parse_footnotes_by_model(page_id, page, model_output_json, md_bookname_save_path, + # debug_mode=debug_mode) + # # 通过规则识别到的footnote + # footnote_bboxes_by_rule = parse_footnotes_by_rule(remain_text_blocks, page_height, page_id) + """ + ================================================================================================================================== + """ + if debug_mode: # debugmode截图到本地 + save_path = join_path(save_tmp_path, "md") + + # 把图、表、公式都进行截图,保存到存储上,返回图片路径作为内容 + image_info, image_backup_info, table_info, inline_eq_info, interline_eq_info = ( + save_images_by_bboxes( + book_name, + page_id, + page, + save_path, + image_bboxes, + images_overlap_backup, + table_bboxes, + equations_inline_bboxes, + equations_interline_bboxes, + # 传入img_s3_client + img_s3_client, + ) + ) # 只要表格和图片的截图 + + """"以下进入到公式替换环节 """ + char_level_text_blocks = page.get_text("rawdict", flags=fitz.TEXTFLAGS_TEXT)[ + "blocks" + ] + remain_text_blocks = combine_chars_to_pymudict( + remain_text_blocks, char_level_text_blocks + ) # 合并chars + remain_text_blocks = replace_equations_in_textblock( + remain_text_blocks, inline_eq_info, interline_eq_info + ) + remain_text_blocks = remove_citation_marker( + remain_text_blocks + ) # 公式替换之后去角标,防止公式无法替换成功。但是这样也会带来个问题就是把角标当公式。各有优劣。 + remain_text_blocks = remove_chars_in_text_blocks( + 
remain_text_blocks + ) # 减少中间态数据体积 + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in inline_eq_info], [b['bbox'] for b in interline_eq_info], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 3) + + """去掉footnote, 从文字和图片中(先去角标再去footnote试试)""" + # 通过模型识别到的footnote + footnote_bboxes_by_model = parse_footnotes_by_model( + page_id, + page, + model_output_json, + md_bookname_save_path, + debug_mode=debug_mode, + ) + # 通过规则识别到的footnote + footnote_bboxes_by_rule = parse_footnotes_by_rule( + remain_text_blocks, page_height, page_id, main_text_font + ) + """进入pdf过滤器,去掉一些不合理的pdf""" + is_good_pdf, err = pdf_filter( + page, remain_text_blocks, table_bboxes, image_bboxes + ) + if not is_good_pdf: + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {err}" + ) + if not debug_mode: + return err + + """ + ================================================================================================================================== + 进行版面布局切分和过滤 + """ + """在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """ + + is_text_block_horz_overlap = check_text_block_horizontal_overlap( + remain_text_blocks, header_bboxs, footer_bboxs + ) + + if is_text_block_horz_overlap: + # debug_show_bbox(pdf_docs, page_id, [b['bbox'] for b in remain_text_blocks], [], [], join_path(save_path, book_name, f"{book_name}_debug.pdf"), 0) + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.TEXT_BLCOK_HOR_OVERLAP, + } + if not debug_mode: + return result + + """统一格式化成一个数据结构用于计算layout""" + page_y0 = 0 if len(header_bboxs) == 0 else max([b[3] for b in header_bboxs]) + page_y1 = ( + page_height if len(footer_bboxs) == 0 else min([b[1] for b in footer_bboxs]) + ) + left_x, right_x = get_side_boundry( + removed_non_horz_text_block, page_width, page_height + ) + page_boundry = [ + math.floor(left_x), + page_y0 + 1, + math.ceil(right_x), + page_y1 - 1, + ] + # 返回的是一个数组,每个元素[x0, y0, x1, y1, block_content, idx_x, idx_y], 初始时候idx_x, idx_y都是None. 
对于图片、公式来说,block_content是图片的地址, 对于段落来说,block_content是段落的内容 + + all_bboxes = prepare_bboxes_for_layout_split( + image_info, + image_backup_info, + table_info, + inline_eq_info, + interline_eq_info, + remain_text_blocks, + page_boundry, + page, + ) + # debug_show_bbox(pdf_docs, page_id, [], [], all_bboxes, join_path(save_path, book_name, f"{book_name}_debug.pdf"), 1) + """page_y0, page_y1能够过滤掉页眉和页脚,不会算作layout内""" + layout_bboxes, layout_tree = get_bboxes_layout( + all_bboxes, page_boundry, page_id + ) + + if ( + len(remain_text_blocks) > 0 + and len(all_bboxes) > 0 + and len(layout_bboxes) == 0 + ): + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.CAN_NOT_DETECT_PAGE_LAYOUT, + } + if not debug_mode: + return result + + """以下去掉复杂的布局和超过2列的布局""" + if any( + [lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes] + ): # 复杂的布局 + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.COMPLICATED_LAYOUT}" + ) + result = {"need_drop": True, "drop_reason": DropReason.COMPLICATED_LAYOUT} + if not debug_mode: + return result + + layout_column_width = get_columns_cnt_of_layout(layout_tree) + if layout_column_width > 2: # 去掉超过2列的布局pdf + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.TOO_MANY_LAYOUT_COLUMNS, + "extra_info": {"column_cnt": layout_column_width}, + } + if not debug_mode: + return result + + """ + ================================================================================================================================== + 构造出下游需要的数据结构 + """ + remain_text_blocks = ( + remain_text_blocks + interline_eq_temp_text_block + ) # 把计算layout时候临时删除的行间公式再放回去,防止行间公式替换的时候丢失。 + removed_text_blocks = [] + removed_text_blocks.extend(removed_hdr_foot_txt_block) + # removed_text_blocks.extend(removed_footnote_text_block) + removed_text_blocks.extend(text_block_on_image_removed) + removed_text_blocks.extend(removed_non_horz_text_block) + removed_text_blocks.extend(removed_colored_narrow_strip_background_text_block) + + removed_images = [] + # removed_images.extend(footnote_imgs) + removed_images.extend(removed_hdr_foot_img_block) + + images_backup = [] + images_backup.extend(image_backup_info) + remain_text_blocks = escape_special_markdown_char( + remain_text_blocks + ) # 转义span里的text + sorted_text_remain_text_block = sort_text_block( + remain_text_blocks, layout_bboxes + ) + + footnote_bboxes_tmp = [] + footnote_bboxes_tmp.extend(footnote_bboxes_by_model) + footnote_bboxes_tmp.extend(footnote_bboxes_by_rule) + + page_info = construct_page_component( + page_id, + image_info, + table_info, + sorted_text_remain_text_block, + layout_bboxes, + inline_eq_info, + interline_eq_info, + page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"], + removed_text_blocks=removed_text_blocks, + removed_image_blocks=removed_images, + images_backup=images_backup, + droped_table_block=[], + table_backup=[], + layout_tree=layout_tree, + page_w=page.rect.width, + page_h=page.rect.height, + footnote_bboxes_tmp=footnote_bboxes_tmp, + ) + + page_info["image_bboxes_with_caption"] = image_bboxes_with_caption # add by xr + page_info["table_bboxes_with_caption"] = table_bboxes_with_caption + + page_info["bak_page_no_bboxes"] = page_no_bboxs + page_info["bak_header_bboxes"] = header_bboxs + page_info["bak_footer_bboxes"] = 
footer_bboxs + + pdf_info_dict[f"page_{page_id}"] = page_info + + # end page for + + """计算后处理阶段耗时""" + start_time = time.time() + + """ + ================================================================================================================================== + 去掉页眉和页脚,这里需要用到一定的统计量,所以放到最后 + 页眉和页脚主要从文本box和图片box中去除,位于页面的四周。 + 下面函数会直接修改pdf_info_dict,从文字块中、图片中删除属于页眉页脚的内容,删除内容做相对应记录 + """ + # 去页眉页脚 + header, footer = drop_footer_header( + pdf_info_dict + ) # TODO: using header and footer boxes here ! + + """对单个layout内footnote和他下面的所有textbbox合并""" + + for page_key, page_info in pdf_info_dict.items(): + page_info = merge_footnote_blocks(page_info, main_text_font) + page_info = remove_footnote_blocks(page_info) + pdf_info_dict[page_key] = page_info + + """进入pdf后置过滤器,去掉一些不合理的pdf""" + + i = 0 + for page_info in pdf_info_dict.values(): + is_good_pdf, err = pdf_post_filter(page_info) + if not is_good_pdf: + logger.warning(f"page_id: {i}, drop this pdf: {book_name}, reason: {err}") + if not debug_mode: + return err + i += 1 + + if debug_mode: + params_file_save_path = join_path( + save_tmp_path, "md", book_name, "preproc_out.json" + ) + page_draw_rect_save_path = join_path( + save_tmp_path, "md", book_name, "layout.pdf" + ) + # dir_path = os.path.dirname(page_draw_rect_save_path) + # if not os.path.exists(dir_path): + # # 如果目录不存在,创建它 + # os.makedirs(dir_path) + + with open(params_file_save_path, "w", encoding="utf-8") as f: + json.dump(pdf_info_dict, f, ensure_ascii=False, indent=4) + # 先检测本地 page_draw_rect_save_path 是否存在,如果存在则删除 + if os.path.exists(page_draw_rect_save_path): + os.remove(page_draw_rect_save_path) + # 绘制bbox和layout到pdf + draw_bbox_on_page(pdf_docs, pdf_info_dict, page_draw_rect_save_path) + draw_layout_bbox_on_page( + pdf_docs, pdf_info_dict, header, footer, page_draw_rect_save_path + ) + + if debug_mode: + # 打印后处理阶段耗时 + logger.info(f"post_processing_time: {get_delta_time(start_time)}") + + """ + ================================================================================================================================== + 进入段落处理-2阶段 + """ + + # 处理行内文字间距较大问题 + pdf_info_dict = solve_inline_too_large_interval(pdf_info_dict) + + start_time = time.time() + + para_process_pipeline = ParaProcessPipeline() + + def _deal_with_text_exception(error_info): + logger.warning( + f"page_id: {page_id}, drop this pdf: {book_name}, reason: {error_info}" + ) + if error_info == denseSingleLineBlockException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.DENSE_SINGLE_LINE_BLOCK}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.DENSE_SINGLE_LINE_BLOCK, + } + return result + if error_info == titleDetectionException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_DETECTION_FAILED}" + ) + result = { + "need_drop": True, + "drop_reason": DropReason.TITLE_DETECTION_FAILED, + } + return result + elif error_info == titleLevelException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.TITLE_LEVEL_FAILED}" + ) + result = {"need_drop": True, "drop_reason": DropReason.TITLE_LEVEL_FAILED} + return result + elif error_info == paraSplitException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_SPLIT_FAILED}" + ) + result = {"need_drop": True, "drop_reason": DropReason.PARA_SPLIT_FAILED} + return result + elif error_info == paraMergeException_msg: + logger.warning( + f"Drop this pdf: {book_name}, reason: {DropReason.PARA_MERGE_FAILED}" + ) + result = {"need_drop": True, 
"drop_reason": DropReason.PARA_MERGE_FAILED} + return result + + if debug_mode: + input_pdf_file = f"{pdf_local_path}.pdf" + output_dir = f"{save_path}/{book_name}" + output_pdf_file = f"{output_dir}/pdf_annos.pdf" + + """ + Call the para_process_pipeline function to process the pdf_info_dict. + + Parameters: + para_debug_mode: str or None + If para_debug_mode is None, the para_process_pipeline will not keep any intermediate results. + If para_debug_mode is "simple", the para_process_pipeline will only keep the annos on the pdf and the final results as a json file. + If para_debug_mode is "full", the para_process_pipeline will keep all the intermediate results generated during each step. + """ + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict, + para_debug_mode="simple", + input_pdf_path=input_pdf_file, + output_pdf_path=output_pdf_file, + ) + # 打印段落处理阶段耗时 + logger.info(f"para_process_time: {get_delta_time(start_time)}") + + # debug的时候不return drop信息 + if error_info is not None: + _deal_with_text_exception(error_info) + return pdf_info_dict + else: + pdf_info_dict, error_info = para_process_pipeline.para_process_pipeline( + pdf_info_dict + ) + if error_info is not None: + return _deal_with_text_exception(error_info) + + return pdf_info_dict diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py index 3bae2225..d4d43a93 100644 --- a/magic_pdf/pipeline.py +++ b/magic_pdf/pipeline.py @@ -3,9 +3,19 @@ import time from urllib.parse import quote -from magic_pdf.dict2md.ocr_mkcontent import ocr_mk_nlp_markdown, ocr_mk_mm_markdown, ocr_mk_mm_standard_format, \ - ocr_mk_mm_markdown_with_para -from magic_pdf.libs.commons import read_file, join_path, parse_bucket_key, formatted_time, s3_image_save_path +from magic_pdf.dict2md.ocr_mkcontent import ( + ocr_mk_nlp_markdown, + ocr_mk_mm_markdown, + ocr_mk_mm_standard_format, + ocr_mk_mm_markdown_with_para, +) +from magic_pdf.libs.commons import ( + read_file, + join_path, + parse_bucket_key, + formatted_time, + s3_image_save_path, +) from magic_pdf.libs.drop_reason import DropReason from magic_pdf.libs.json_compressor import JsonCompressor from magic_pdf.dict2md.mkcontent import mk_nlp_markdown, mk_universal_format @@ -13,51 +23,54 @@ from magic_pdf.filter.pdf_classify_by_type import classify from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan from loguru import logger - +from magic_pdf.pdf_parse_for_train import parse_pdf_for_train +rom magic_pdf.train_utils.convert_to_train_format import convert_to_train_format from app.common.s3 import get_s3_config, get_s3_client from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr def exception_handler(jso: dict, e): logger.exception(e) - jso['need_drop'] = True - jso['drop_reason'] = DropReason.Exception - jso['exception'] = f"ERROR: {e}" + jso["need_drop"] = True + jso["drop_reason"] = DropReason.Exception + jso["exception"] = f"ERROR: {e}" return jso def get_data_type(jso: dict): - data_type = jso.get('data_type') + data_type = jso.get("data_type") if data_type is None: - data_type = jso.get('file_type') + data_type = jso.get("file_type") return data_type def get_bookid(jso: dict): - book_id = jso.get('bookid') + book_id = jso.get("bookid") if book_id is None: - book_id = jso.get('original_file_id') + book_id = jso.get("original_file_id") return book_id def get_data_source(jso: dict): - data_source = jso.get('data_source') + data_source = jso.get("data_source") if data_source is None: - data_source = jso.get('file_source') + data_source = jso.get("file_source") 
return data_source def meta_scan(jso: dict, doc_layout_check=True) -> dict: - s3_pdf_path = jso.get('file_location') + s3_pdf_path = jso.get("file_location") s3_config = get_s3_config(s3_pdf_path) if doc_layout_check: - if 'doc_layout_result' not in jso: # 检测json中是存在模型数据,如果没有则需要跳过该pdf - jso['need_drop'] = True - jso['drop_reason'] = DropReason.MISS_DOC_LAYOUT_RESULT + if ( + "doc_layout_result" not in jso + ): # 检测json中是存在模型数据,如果没有则需要跳过该pdf + jso["need_drop"] = True + jso["drop_reason"] = DropReason.MISS_DOC_LAYOUT_RESULT return jso try: data_source = get_data_source(jso) - file_id = jso.get('file_id') + file_id = jso.get("file_id") book_name = f"{data_source}/{file_id}" # 首页存在超量drawing问题 @@ -68,90 +81,111 @@ def meta_scan(jso: dict, doc_layout_check=True) -> dict: # return jso start_time = time.time() # 记录开始时间 - logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr) + logger.info( + f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", + file=sys.stderr, + ) file_content = read_file(s3_pdf_path, s3_config) read_file_time = int(time.time() - start_time) # 计算执行时间 start_time = time.time() # 记录开始时间 res = pdf_meta_scan(s3_pdf_path, file_content) - if res.get('need_drop', False): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析 - jso['need_drop'] = True - jso['drop_reason'] = res["drop_reason"] + if res.get( + "need_drop", False + ): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析 + jso["need_drop"] = True + jso["drop_reason"] = res["drop_reason"] else: # 正常返回 - jso['pdf_meta'] = res - jso['content'] = "" - jso['remark'] = "" - jso['data_url'] = "" + jso["pdf_meta"] = res + jso["content"] = "" + jso["remark"] = "" + jso["data_url"] = "" end_time = time.time() # 记录结束时间 meta_scan_time = int(end_time - start_time) # 计算执行时间 - logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}", file=sys.stderr) - jso['read_file_time'] = read_file_time - jso['meta_scan_time'] = meta_scan_time + logger.info( + f"book_name is:{book_name},end_time is:{formatted_time(end_time)},read_file_time is:{read_file_time},meta_scan_time is:{meta_scan_time}", + file=sys.stderr, + ) + jso["read_file_time"] = read_file_time + jso["meta_scan_time"] = meta_scan_time except Exception as e: jso = exception_handler(jso, e) return jso def classify_by_type(jso: dict, debug_mode=False) -> dict: - #检测debug开关 + # 检测debug开关 if debug_mode: pass - else:# 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): + else: # 如果debug没开,则检测是否有needdrop字段 + if jso.get("need_drop", False): return jso # 开始正式逻辑 try: - pdf_meta = jso.get('pdf_meta') + pdf_meta = jso.get("pdf_meta") data_source = get_data_source(jso) - file_id = jso.get('file_id') + file_id = jso.get("file_id") book_name = f"{data_source}/{file_id}" total_page = pdf_meta["total_page"] page_width = pdf_meta["page_width_pts"] page_height = pdf_meta["page_height_pts"] img_sz_list = pdf_meta["image_info_per_page"] - img_num_list = pdf_meta['imgs_per_page'] - text_len_list = pdf_meta['text_len_per_page'] - text_layout_list = pdf_meta['text_layout_per_page'] - text_language = pdf_meta['text_language'] + img_num_list = pdf_meta["imgs_per_page"] + text_len_list = pdf_meta["text_len_per_page"] + text_layout_list = pdf_meta["text_layout_per_page"] + text_language = pdf_meta["text_language"] # allow_language = ['zh', 'en'] # 允许的语言,目前只允许简中和英文的 # if text_language not in allow_language: # 如果语言不在允许的语言中,则drop # jso['need_drop'] = True # jso['drop_reason'] = 
DropReason.NOT_ALLOW_LANGUAGE # return jso - pdf_path = pdf_meta['pdf_path'] - is_encrypted = pdf_meta['is_encrypted'] - is_needs_password = pdf_meta['is_needs_password'] - if is_encrypted or is_needs_password: # 加密的,需要密码的,没有页面的,都不处理 - jso['need_drop'] = True - jso['drop_reason'] = DropReason.ENCRYPTED + pdf_path = pdf_meta["pdf_path"] + is_encrypted = pdf_meta["is_encrypted"] + is_needs_password = pdf_meta["is_needs_password"] + if ( + is_encrypted or is_needs_password + ): # 加密的,需要密码的,没有页面的,都不处理 + jso["need_drop"] = True + jso["drop_reason"] = DropReason.ENCRYPTED else: start_time = time.time() # 记录开始时间 - is_text_pdf, results = classify(pdf_path, total_page, page_width, page_height, img_sz_list, text_len_list, img_num_list, text_layout_list) + is_text_pdf, results = classify( + pdf_path, + total_page, + page_width, + page_height, + img_sz_list, + text_len_list, + img_num_list, + text_layout_list, + ) classify_time = int(time.time() - start_time) # 计算执行时间 if is_text_pdf: - pdf_meta['is_text_pdf'] = is_text_pdf - jso['pdf_meta'] = pdf_meta - jso['classify_time'] = classify_time + pdf_meta["is_text_pdf"] = is_text_pdf + jso["pdf_meta"] = pdf_meta + jso["classify_time"] = classify_time # print(json.dumps(pdf_meta, ensure_ascii=False)) - allow_language = ['zh', 'en'] # 允许的语言,目前只允许简中和英文的 - if text_language not in allow_language: # 如果语言不在允许的语言中,则drop - jso['need_drop'] = True - jso['drop_reason'] = DropReason.NOT_ALLOW_LANGUAGE + allow_language = ["zh", "en"] # 允许的语言,目前只允许简中和英文的 + if ( + text_language not in allow_language + ): # 如果语言不在允许的语言中,则drop + jso["need_drop"] = True + jso["drop_reason"] = DropReason.NOT_ALLOW_LANGUAGE return jso else: # 先不drop - pdf_meta['is_text_pdf'] = is_text_pdf - jso['pdf_meta'] = pdf_meta - jso['classify_time'] = classify_time - jso['need_drop'] = True - jso['drop_reason'] = DropReason.NOT_IS_TEXT_PDF + pdf_meta["is_text_pdf"] = is_text_pdf + jso["pdf_meta"] = pdf_meta + jso["classify_time"] = classify_time + jso["need_drop"] = True + jso["drop_reason"] = DropReason.NOT_IS_TEXT_PDF extra_info = {"classify_rules": []} for condition, result in results.items(): if not result: extra_info["classify_rules"].append(condition) - jso['extra_info'] = extra_info + jso["extra_info"] = extra_info except Exception as e: jso = exception_handler(jso, e) @@ -162,48 +196,69 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict: if debug_mode: pass - else:# 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): - logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr) + else: # 如果debug没开,则检测是否有needdrop字段 + if jso.get("need_drop", False): + logger.info( + f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", + file=sys.stderr, + ) jso["dropped"] = True return jso try: data_source = get_data_source(jso) - file_id = jso.get('file_id') + file_id = jso.get("file_id") book_name = f"{data_source}/{file_id}" - title = jso.get('title') - url_encode_title = quote(title, safe='') - if data_source != 'scihub': + title = jso.get("title") + url_encode_title = quote(title, safe="") + if data_source != "scihub": return jso - pdf_intermediate_dict = jso['pdf_intermediate_dict'] + pdf_intermediate_dict = jso["pdf_intermediate_dict"] # 将 pdf_intermediate_dict 解压 pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) i = 0 for page in pdf_intermediate_dict.values(): - if page.get('tables'): - if len(page['tables']) > 0: + if page.get("tables"): + if len(page["tables"]) > 0: j = 0 - for table in page['tables']: + 
for table in page["tables"]: if debug_mode: - image_path = join_path("s3://mllm-raw-media/pdf2md_img/", book_name, table['image_path']) + image_path = join_path( + "s3://mllm-raw-media/pdf2md_img/", + book_name, + table["image_path"], + ) else: - image_path = join_path("s3://mllm-raw-media/pdf2md_img/", table['image_path']) + image_path = join_path( + "s3://mllm-raw-media/pdf2md_img/", table["image_path"] + ) - if image_path.endswith('.jpg'): + if image_path.endswith(".jpg"): j += 1 s3_client = get_s3_client(image_path) bucket_name, bucket_key = parse_bucket_key(image_path) # 通过s3_client获取图片到内存 - image_bytes = s3_client.get_object(Bucket=bucket_name, Key=bucket_key)['Body'].read() + image_bytes = s3_client.get_object( + Bucket=bucket_name, Key=bucket_key + )["Body"].read() # 保存图片到新的位置 if debug_mode: - new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/", url_encode_title + "_" + table['image_path'].lstrip('tables/')) + new_image_path = join_path( + "s3://mllm-raw-media/pdf2md_img/table_new/", + url_encode_title + + "_" + + table["image_path"].lstrip("tables/"), + ) else: - new_image_path = join_path("s3://mllm-raw-media/pdf2md_img/table_new/", url_encode_title + f"_page{i}_{j}.jpg") + new_image_path = join_path( + "s3://mllm-raw-media/pdf2md_img/table_new/", + url_encode_title + f"_page{i}_{j}.jpg", + ) logger.info(new_image_path, file=sys.stderr) bucket_name, bucket_key = parse_bucket_key(new_image_path) - s3_client.put_object(Bucket=bucket_name, Key=bucket_key, Body=image_bytes) + s3_client.put_object( + Bucket=bucket_name, Key=bucket_key, Body=image_bytes + ) else: continue i += 1 @@ -218,8 +273,11 @@ def save_tables_to_s3(jso: dict, debug_mode=False) -> dict: def drop_needdrop_pdf(jso: dict) -> dict: - if jso.get('need_drop', False): - logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", file=sys.stderr) + if jso.get("need_drop", False): + logger.info( + f"book_name is:{get_data_source(jso)}/{jso['file_id']} need drop", + file=sys.stderr, + ) jso["dropped"] = True return jso @@ -228,19 +286,19 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: if debug_mode: pass - else:# 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): - book_name = join_path(get_data_source(jso), jso['file_id']) + else: # 如果debug没开,则检测是否有needdrop字段 + if jso.get("need_drop", False): + book_name = join_path(get_data_source(jso), jso["file_id"]) logger.info(f"book_name is:{book_name} need drop", file=sys.stderr) jso["dropped"] = True return jso try: - pdf_intermediate_dict = jso['pdf_intermediate_dict'] + pdf_intermediate_dict = jso["pdf_intermediate_dict"] # 将 pdf_intermediate_dict 解压 pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) - #markdown_content = mk_nlp_markdown(pdf_intermediate_dict) - jso['content_list'] = mk_universal_format(pdf_intermediate_dict) - #jso["content"] = markdown_content + # markdown_content = mk_nlp_markdown(pdf_intermediate_dict) + jso["content_list"] = mk_universal_format(pdf_intermediate_dict) + # jso["content"] = markdown_content logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']}") # 把无用的信息清空 jso["doc_layout_result"] = "" @@ -252,18 +310,18 @@ def pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: - #检测debug开关 + # 检测debug开关 if debug_mode: pass - else:# 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): + else: # 如果debug没开,则检测是否有needdrop字段 + if jso.get("need_drop", 
False): return jso # 开始正式逻辑 - s3_pdf_path = jso.get('file_location') + s3_pdf_path = jso.get("file_location") s3_config = get_s3_config(s3_pdf_path) - model_output_json_list = jso.get('doc_layout_result') + model_output_json_list = jso.get("doc_layout_result") data_source = get_data_source(jso) - file_id = jso.get('file_id') + file_id = jso.get("file_id") book_name = f"{data_source}/{file_id}" # 1.23.22已修复 @@ -275,15 +333,15 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: # jso['drop_reason'] = DropReason.SPECIAL_PDF # return jso - junk_img_bojids = jso['pdf_meta']['junk_img_bojids'] + junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"] # total_page = jso['pdf_meta']['total_page'] # 增加检测 max_svgs 数量的检测逻辑,如果 max_svgs 超过3000则drop - svgs_per_page_list = jso['pdf_meta']['svgs_per_page'] + svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"] max_svgs = max(svgs_per_page_list) if max_svgs > 3000: - jso['need_drop'] = True - jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS + jso["need_drop"] = True + jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS # elif total_page > 1000: # jso['need_drop'] = True # jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES @@ -293,44 +351,65 @@ def parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: image_s3_config = get_s3_config(save_path) start_time = time.time() # 记录开始时间 # 先打印一下book_name和解析开始的时间 - logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr) - pdf_info_dict = parse_pdf_by_model(s3_pdf_path, s3_config, model_output_json_list, save_path, - book_name, pdf_model_profile=None, - image_s3_config=image_s3_config, - start_page_id=start_page_id, junk_img_bojids=junk_img_bojids, - debug_mode=debug_mode) - if pdf_info_dict.get('need_drop', False): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析 - jso['need_drop'] = True - jso['drop_reason'] = pdf_info_dict["drop_reason"] + logger.info( + f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", + file=sys.stderr, + ) + pdf_info_dict = parse_pdf_by_model( + s3_pdf_path, + s3_config, + model_output_json_list, + save_path, + book_name, + pdf_model_profile=None, + image_s3_config=image_s3_config, + start_page_id=start_page_id, + junk_img_bojids=junk_img_bojids, + debug_mode=debug_mode, + ) + if pdf_info_dict.get( + "need_drop", False + ): # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析 + jso["need_drop"] = True + jso["drop_reason"] = pdf_info_dict["drop_reason"] else: # 正常返回,将 pdf_info_dict 压缩并存储 pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict) - jso['pdf_intermediate_dict'] = pdf_info_dict + jso["pdf_intermediate_dict"] = pdf_info_dict end_time = time.time() # 记录完成时间 parse_time = int(end_time - start_time) # 计算执行时间 # 解析完成后打印一下book_name和耗时 - logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", file=sys.stderr) - jso['parse_time'] = parse_time + logger.info( + f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", + file=sys.stderr, + ) + jso["parse_time"] = parse_time except Exception as e: jso = exception_handler(jso, e) return jso -''' + +""" 统一处理逻辑 1.先调用parse_pdf对文本类pdf进行处理 2.再调用ocr_dropped_parse_pdf,对之前drop的pdf进行处理 -''' +""" + + def uni_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: jso = parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode) jso = ocr_dropped_parse_pdf(jso, start_page_id=start_page_id, debug_mode=debug_mode) return jso + # 
专门用来跑被drop的pdf,跑完之后需要把need_drop字段置为false def ocr_dropped_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: - if not jso.get('need_drop', False): + if not jso.get("need_drop", False): return jso else: - jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode) - jso['need_drop'] = False + jso = ocr_parse_pdf_core( + jso, start_page_id=start_page_id, debug_mode=debug_mode + ) + jso["need_drop"] = False return jso @@ -339,7 +418,7 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: if debug_mode: pass else: # 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): + if jso.get("need_drop", False): return jso jso = ocr_parse_pdf_core(jso, start_page_id=start_page_id, debug_mode=debug_mode) @@ -347,18 +426,21 @@ def ocr_parse_pdf(jso: dict, start_page_id=0, debug_mode=False) -> dict: def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict: - s3_pdf_path = jso.get('file_location') + s3_pdf_path = jso.get("file_location") s3_config = get_s3_config(s3_pdf_path) - model_output_json_list = jso.get('doc_layout_result') + model_output_json_list = jso.get("doc_layout_result") data_source = get_data_source(jso) - file_id = jso.get('file_id') + file_id = jso.get("file_id") book_name = f"{data_source}/{file_id}" try: save_path = s3_image_save_path image_s3_config = get_s3_config(save_path) start_time = time.time() # 记录开始时间 # 先打印一下book_name和解析开始的时间 - logger.info(f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", file=sys.stderr) + logger.info( + f"book_name is:{book_name},start_time is:{formatted_time(start_time)}", + file=sys.stderr, + ) pdf_info_dict = parse_pdf_by_ocr( s3_pdf_path, s3_config, @@ -368,15 +450,18 @@ def ocr_parse_pdf_core(jso: dict, start_page_id=0, debug_mode=False) -> dict: pdf_model_profile=None, image_s3_config=image_s3_config, start_page_id=start_page_id, - debug_mode=debug_mode + debug_mode=debug_mode, ) pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict) - jso['pdf_intermediate_dict'] = pdf_info_dict + jso["pdf_intermediate_dict"] = pdf_info_dict end_time = time.time() # 记录完成时间 parse_time = int(end_time - start_time) # 计算执行时间 # 解析完成后打印一下book_name和耗时 - logger.info(f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", file=sys.stderr) - jso['parse_time'] = parse_time + logger.info( + f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}", + file=sys.stderr, + ) + jso["parse_time"] = parse_time except Exception as e: jso = exception_handler(jso, e) return jso @@ -387,18 +472,21 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: if debug_mode: pass else: # 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): - book_name = join_path(get_data_source(jso), jso['file_id']) + if jso.get("need_drop", False): + book_name = join_path(get_data_source(jso), jso["file_id"]) logger.info(f"book_name is:{book_name} need drop", file=sys.stderr) jso["dropped"] = True return jso try: - pdf_intermediate_dict = jso['pdf_intermediate_dict'] + pdf_intermediate_dict = jso["pdf_intermediate_dict"] # 将 pdf_intermediate_dict 解压 pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) markdown_content = ocr_mk_mm_markdown(pdf_intermediate_dict) jso["content"] = markdown_content - logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr) + logger.info( + f"book_name 
is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", + file=sys.stderr, + ) # 把无用的信息清空 jso["doc_layout_result"] = "" jso["pdf_intermediate_dict"] = "" @@ -408,23 +496,28 @@ def ocr_pdf_intermediate_dict_to_markdown(jso: dict, debug_mode=False) -> dict: return jso -def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa(jso: dict, debug_mode=False) -> dict: +def ocr_pdf_intermediate_dict_to_markdown_with_para_for_qa( + jso: dict, debug_mode=False +) -> dict: if debug_mode: pass else: # 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): - book_name = join_path(get_data_source(jso), jso['file_id']) + if jso.get("need_drop", False): + book_name = join_path(get_data_source(jso), jso["file_id"]) logger.info(f"book_name is:{book_name} need drop", file=sys.stderr) jso["dropped"] = True return jso try: - pdf_intermediate_dict = jso['pdf_intermediate_dict'] + pdf_intermediate_dict = jso["pdf_intermediate_dict"] # 将 pdf_intermediate_dict 解压 pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) markdown_content = ocr_mk_mm_markdown_with_para(pdf_intermediate_dict) jso["content_ocr"] = markdown_content - logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", file=sys.stderr) + logger.info( + f"book_name is:{get_data_source(jso)}/{jso['file_id']},markdown content length is {len(markdown_content)}", + file=sys.stderr, + ) # 把无用的信息清空 jso["doc_layout_result"] = "" jso["pdf_intermediate_dict"] = "" @@ -439,18 +532,21 @@ def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> if debug_mode: pass else: # 如果debug没开,则检测是否有needdrop字段 - if jso.get('need_drop', False): - book_name = join_path(get_data_source(jso), jso['file_id']) + if jso.get("need_drop", False): + book_name = join_path(get_data_source(jso), jso["file_id"]) logger.info(f"book_name is:{book_name} need drop", file=sys.stderr) jso["dropped"] = True return jso try: - pdf_intermediate_dict = jso['pdf_intermediate_dict'] + pdf_intermediate_dict = jso["pdf_intermediate_dict"] # 将 pdf_intermediate_dict 解压 pdf_intermediate_dict = JsonCompressor.decompress_json(pdf_intermediate_dict) standard_format = ocr_mk_mm_standard_format(pdf_intermediate_dict) jso["content_list"] = standard_format - logger.info(f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}", file=sys.stderr) + logger.info( + f"book_name is:{get_data_source(jso)}/{jso['file_id']},content_list length is {len(standard_format)}", + file=sys.stderr, + ) # 把无用的信息清空 jso["doc_layout_result"] = "" jso["pdf_intermediate_dict"] = "" @@ -460,5 +556,85 @@ def ocr_pdf_intermediate_dict_to_standard_format(jso: dict, debug_mode=False) -> return jso +def parse_pdf_for_model_train(jso: dict, start_page_id=0, debug_mode=False) -> dict: + # 检测debug开关 + if debug_mode: + pass + else: # 如果debug没开,则检测是否有needdrop字段 + if jso.get("need_drop", False): + return jso + # 开始正式逻辑 + s3_pdf_path = jso.get("file_location") + s3_config = get_s3_config(s3_pdf_path) + model_output_json_list = jso.get("doc_layout_result") + data_source = get_data_source(jso) + file_id = jso.get("file_id") + book_name = f"{data_source}/{file_id}" + + # 1.23.22已修复 + # if debug_mode: + # pass + # else: + # if book_name == "zlib/zlib_21929367": + # jso['need_drop'] = True + # jso['drop_reason'] = DropReason.SPECIAL_PDF + # return jso + + junk_img_bojids = jso["pdf_meta"]["junk_img_bojids"] + # total_page = jso['pdf_meta']['total_page'] + + 
# 增加检测 max_svgs 数量的检测逻辑,如果 max_svgs 超过3000则drop
+    svgs_per_page_list = jso["pdf_meta"]["svgs_per_page"]
+    max_svgs = max(svgs_per_page_list)
+    if max_svgs > 3000:
+        jso["need_drop"] = True
+        jso["drop_reason"] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_SVGS
+    # elif total_page > 1000:
+    #     jso['need_drop'] = True
+    #     jso['drop_reason'] = DropReason.HIGH_COMPUTATIONAL_lOAD_BY_TOTAL_PAGES
+    else:
+        try:
+            save_path = s3_image_save_path
+            image_s3_config = get_s3_config(save_path)
+            start_time = time.time()  # 记录开始时间
+            # 先打印一下book_name和解析开始的时间
+            logger.info(
+                f"book_name is:{book_name},start_time is:{formatted_time(start_time)}",
+                file=sys.stderr,
+            )
+            pdf_info_dict = parse_pdf_for_train(
+                s3_pdf_path,
+                s3_config,
+                model_output_json_list,
+                save_path,
+                book_name,
+                pdf_model_profile=None,
+                image_s3_config=image_s3_config,
+                start_page_id=start_page_id,
+                junk_img_bojids=junk_img_bojids,
+                debug_mode=debug_mode,
+            )
+            if pdf_info_dict.get(
+                "need_drop", False
+            ):  # 如果返回的字典里有need_drop,则提取drop_reason并跳过本次解析
+                jso["need_drop"] = True
+                jso["drop_reason"] = pdf_info_dict["drop_reason"]
+            else:  # 正常返回,将 pdf_info_dict 压缩并存储
+                # convert before compressing: convert_to_train_format expects the
+                # uncompressed per-page dict, not the JsonCompressor output
+                jso["parsed_results"] = convert_to_train_format(pdf_info_dict)
+                pdf_info_dict = JsonCompressor.compress_json(pdf_info_dict)
+                jso["pdf_intermediate_dict"] = pdf_info_dict
+            end_time = time.time()  # 记录完成时间
+            parse_time = int(end_time - start_time)  # 计算执行时间
+            # 解析完成后打印一下book_name和耗时
+            logger.info(
+                f"book_name is:{book_name},end_time is:{formatted_time(end_time)},cost_time is:{parse_time}",
+                file=sys.stderr,
+            )
+            jso["parse_time"] = parse_time
+        except Exception as e:
+            jso = exception_handler(jso, e)
+    return jso
+
+
 if __name__ == "__main__":
     pass
diff --git a/magic_pdf/train_utils/__init__.py b/magic_pdf/train_utils/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/magic_pdf/train_utils/convert_to_train_format.py b/magic_pdf/train_utils/convert_to_train_format.py
new file mode 100644
index 00000000..dcd99c5d
--- /dev/null
+++ b/magic_pdf/train_utils/convert_to_train_format.py
@@ -0,0 +1,52 @@
+
+
+def convert_to_train_format(jso: dict) -> list:
+    pages = []
+    for k, v in jso.items():
+        page_idx = v["page_idx"]
+        width, height = v["page_size"]
+
+        info = {"page_info": {"page_no": page_idx, "height": height, "width": width}}
+
+        bboxes: list[dict] = []
+        for img_bbox in v["image_bboxes_with_caption"]:
+            bbox = {"category_id": 1, "bbox": img_bbox["bbox"]}
+            if "caption" in img_bbox:
+                bbox["caption_bbox"] = img_bbox["caption"]
+            bboxes.append(bbox)
+
+        for tbl_bbox in v["table_bboxes_with_caption"]:
+            bbox = {"category_id": 7, "bbox": tbl_bbox["bbox"]}
+            if "caption" in tbl_bbox:
+                bbox["caption_bbox"] = tbl_bbox["caption"]
+            bboxes.append(bbox)
+
+        for bbox in v["bak_page_no_bboxes"]:
+            n_bbox = {"category_id": 4, "bbox": bbox}
+            bboxes.append(n_bbox)
+
+        for bbox in v["bak_header_bboxes"]:
+            n_bbox = {"category_id": 3, "bbox": bbox}
+            bboxes.append(n_bbox)
+
+        for bbox in v["bak_footer_bboxes"]:
+            n_bbox = {"category_id": 6, "bbox": bbox}
+            bboxes.append(n_bbox)
+
+        # 脚注, 目前没有看到例子
+        for para in v["para_blocks"]:
+            n_bbox = {"category_id": 2, "bbox": para["bbox"]}
+            bboxes.append(n_bbox)
+
+        for inline_equation in v["inline_equations"]:
+            n_bbox = {"category_id": 13, "bbox": inline_equation["bbox"]}
+            bboxes.append(n_bbox)
+
+        for inter_equation in v["interline_equations"]:
+            n_bbox = {"category_id": 10, "bbox": inter_equation["bbox"]}
+            bboxes.append(n_bbox)
+
+        info["bboxes"] = bboxes
+        pages.append(info)
+
+    return pages
diff --git 
a/magic_pdf/train_utils/extract_caption.py b/magic_pdf/train_utils/extract_caption.py new file mode 100644 index 00000000..74e0c51d --- /dev/null +++ b/magic_pdf/train_utils/extract_caption.py @@ -0,0 +1,59 @@ +from magic_pdf.libs.boxbase import _is_in + + +def extract_caption_bbox(outer: list, inner: list) -> list: + """ + ret: list of { + "bbox": [1,2,3,4], + "caption": [5,6,7,8] # may existed + } + + """ + found_count = 0 # for debug + print(outer, inner) + + def is_float_equal(a, b): + if 0.01 > abs(a - b): # non strict float equal compare + return True + return False + + outer_h = {i: outer[i] for i in range(len(outer))} + ret = [] + for v in inner: + ix0, iy0, ix1, iy1 = v + found_idx = None + d = {"bbox": v[:4]} + for k in outer_h: + ox0, oy0, ox1, oy1 = outer_h[k] + equal_float_flags = [ + is_float_equal(ix0, ox0), + is_float_equal(iy0, oy0), + is_float_equal(ix1, ox1), + is_float_equal(iy1, oy1), + ] + if _is_in(v, outer_h[k]) and not all(equal_float_flags): + found_idx = k + break + if found_idx is not None: + found_count += 1 + captions: list[list] = [] + ox0, oy0, ox1, oy1 = outer_h[found_idx] + captions = [ + [ox0, oy0, ix0, oy1], + [ox0, oy0, ox1, iy0], + [ox0, iy1, ox1, oy1], + [ix1, oy0, ox1, oy1], + ] + captions = sorted( + captions, + key=lambda rect: abs(rect[0] - rect[2]) * abs(rect[1] - rect[3]), + ) # 面积最大的框就是caption + d["caption"] = captions[-1] + outer_h.pop( + found_idx + ) # 同一个 outer box 只能用于确定一个 inner box 的 caption 位置。 + + ret.append(d) + + print("found_count: ", found_count) + return ret diff --git a/magic_pdf/train_utils/remove_footer_header.py b/magic_pdf/train_utils/remove_footer_header.py new file mode 100644 index 00000000..57bd3e9e --- /dev/null +++ b/magic_pdf/train_utils/remove_footer_header.py @@ -0,0 +1,159 @@ +import re + +from magic_pdf.libs.boxbase import _is_in_or_part_overlap +from magic_pdf.libs.drop_tag import CONTENT_IN_FOOT_OR_HEADER, PAGE_NO + + +""" + copy from pre_proc/remove_footer_header.py +""" + + +def remove_headder_footer_one_page( + text_raw_blocks, + image_bboxes, + table_bboxes, + header_bboxs, + footer_bboxs, + page_no_bboxs, + page_w, + page_h, +): + """ + 删除页眉页脚,页码 + 从line级别进行删除,删除之后观察这个text-block是否是空的,如果是空的,则移动到remove_list中 + """ + if 1: + return image_bboxes, table_bboxes, text_raw_blocks, [], [], [] + + header = [] + footer = [] + if len(header) == 0: + model_header = header_bboxs + if model_header: + x0 = min([x for x, _, _, _ in model_header]) + y0 = min([y for _, y, _, _ in model_header]) + x1 = max([x1 for _, _, x1, _ in model_header]) + y1 = max([y1 for _, _, _, y1 in model_header]) + header = [x0, y0, x1, y1] + if len(footer) == 0: + model_footer = footer_bboxs + if model_footer: + x0 = min([x for x, _, _, _ in model_footer]) + y0 = min([y for _, y, _, _ in model_footer]) + x1 = max([x1 for _, _, x1, _ in model_footer]) + y1 = max([y1 for _, _, _, y1 in model_footer]) + footer = [x0, y0, x1, y1] + + header_y0 = 0 if len(header) == 0 else header[3] + footer_y0 = page_h if len(footer) == 0 else footer[1] + if page_no_bboxs: + top_part = [b for b in page_no_bboxs if b[3] < page_h / 2] + btn_part = [b for b in page_no_bboxs if b[1] > page_h / 2] + + top_max_y0 = max([b[1] for b in top_part]) if top_part else 0 + btn_min_y1 = min([b[3] for b in btn_part]) if btn_part else page_h + + header_y0 = max(header_y0, top_max_y0) + footer_y0 = min(footer_y0, btn_min_y1) + + content_boundry = [0, header_y0, page_w, footer_y0] + + header = [0, 0, page_w, header_y0] + footer = [0, footer_y0, page_w, page_h] + + 
"""以上计算出来了页眉页脚的边界,下面开始进行删除""" + text_block_to_remove = [] + # 首先检查每个textblock + for blk in text_raw_blocks: + if len(blk["lines"]) > 0: + for line in blk["lines"]: + line_del = [] + for span in line["spans"]: + span_del = [] + if span["bbox"][3] < header_y0: + span_del.append(span) + elif _is_in_or_part_overlap( + span["bbox"], header + ) or _is_in_or_part_overlap(span["bbox"], footer): + span_del.append(span) + for span in span_del: + line["spans"].remove(span) + if not line["spans"]: + line_del.append(line) + + for line in line_del: + blk["lines"].remove(line) + else: + # if not blk['lines']: + blk["tag"] = CONTENT_IN_FOOT_OR_HEADER + text_block_to_remove.append(blk) + + """有的时候由于pageNo太小了,总是会有一点和content_boundry重叠一点,被放入正文,因此对于pageNo,进行span粒度的删除""" + page_no_block_2_remove = [] + if page_no_bboxs: + for pagenobox in page_no_bboxs: + for block in text_raw_blocks: + if _is_in_or_part_overlap( + pagenobox, block["bbox"] + ): # 在span级别删除页码 + for line in block["lines"]: + for span in line["spans"]: + if _is_in_or_part_overlap(pagenobox, span["bbox"]): + # span['text'] = '' + span["tag"] = PAGE_NO + # 检查这个block是否只有这一个span,如果是,那么就把这个block也删除 + if len(line["spans"]) == 1 and len(block["lines"]) == 1: + page_no_block_2_remove.append(block) + else: + # 测试最后一个是不是页码:规则是,最后一个block仅有1个line,一个span,且text是数字,空格,符号组成,不含字母,并且包含数字 + if len(text_raw_blocks) > 0: + text_raw_blocks.sort(key=lambda x: x["bbox"][1], reverse=True) + last_block = text_raw_blocks[0] + if len(last_block["lines"]) == 1: + last_line = last_block["lines"][0] + if len(last_line["spans"]) == 1: + last_span = last_line["spans"][0] + if ( + last_span["text"].strip() + and not re.search("[a-zA-Z]", last_span["text"]) + and re.search("[0-9]", last_span["text"]) + ): + last_span["tag"] = PAGE_NO + page_no_block_2_remove.append(last_block) + + for b in page_no_block_2_remove: + text_block_to_remove.append(b) + + for blk in text_block_to_remove: + if blk in text_raw_blocks: + text_raw_blocks.remove(blk) + + text_block_remain = text_raw_blocks + image_bbox_to_remove = [ + bbox + for bbox in image_bboxes + if not _is_in_or_part_overlap(bbox, content_boundry) + ] + + image_bbox_remain = [ + bbox for bbox in image_bboxes if _is_in_or_part_overlap(bbox, content_boundry) + ] + table_bbox_to_remove = [ + bbox + for bbox in table_bboxes + if not _is_in_or_part_overlap(bbox, content_boundry) + ] + table_bbox_remain = [ + bbox for bbox in table_bboxes if _is_in_or_part_overlap(bbox, content_boundry) + ] + + # 1, 2, 3 + return ( + image_bbox_remain, + table_bbox_remain, + text_block_remain, + text_block_to_remove, + image_bbox_to_remove, + table_bbox_to_remove, + ) diff --git a/magic_pdf/train_utils/vis_utils.py b/magic_pdf/train_utils/vis_utils.py new file mode 100644 index 00000000..996ae514 --- /dev/null +++ b/magic_pdf/train_utils/vis_utils.py @@ -0,0 +1,327 @@ +from magic_pdf.libs.commons import fitz +import os +from magic_pdf.libs.coordinate_transform import get_scale_ratio + + +def draw_model_output( + raw_pdf_doc: fitz.Document, paras_dict_arr: list[dict], save_path: str +): + """ + 在page上画出bbox,保存到save_path + """ + """ + + # {0: 'title', # 标题 + # 1: 'figure', # 图片 + # 2: 'plain text', # 文本 + # 3: 'header', # 页眉 + # 4: 'page number', # 页码 + # 5: 'footnote', # 脚注 + # 6: 'footer', # 页脚 + # 7: 'table', # 表格 + # 8: 'table caption', # 表格描述 + # 9: 'figure caption', # 图片描述 + # 10: 'equation', # 公式 + # 11: 'full column', # 单栏 + # 12: 'sub column', # 多栏 + # 13: 'embedding', # 嵌入公式 + # 14: 'isolated'} # 单行公式 + + """ + + color_map = { + "body": 
fitz.pdfcolor["green"], + "non_body": fitz.pdfcolor["red"], + } + """ + {"layout_dets": [], "subfield_dets": [], "page_info": {"page_no": 22, "height": 1650, "width": 1275}} + """ + for i, page in enumerate(raw_pdf_doc): + v = paras_dict_arr[i] + page_idx = v["page_info"]["page_no"] + width = v["page_info"]["width"] + height = v["page_info"]["height"] + + horizontal_scale_ratio, vertical_scale_ratio = get_scale_ratio( + paras_dict_arr[i], page + ) + + for order, block in enumerate(v["layout_dets"]): + L = block["poly"][0] / horizontal_scale_ratio + U = block["poly"][1] / vertical_scale_ratio + R = block["poly"][2] / horizontal_scale_ratio + D = block["poly"][5] / vertical_scale_ratio + # L += pageL # 有的页面,artBox偏移了。不在(0,0) + # R += pageL + # U += pageU + # D += pageU + L, R = min(L, R), max(L, R) + U, D = min(U, D), max(U, D) + bbox = [L, U, R, D] + color = color_map["body"] + if block["category_id"] in (3, 4, 5, 6, 0): + color = color_map["non_body"] + + rect = fitz.Rect(bbox) + page.draw_rect(rect, fill=None, width=0.5, overlay=True, color=color) + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + raw_pdf_doc.save(save_path) + + +def debug_show_bbox( + raw_pdf_doc: fitz.Document, + page_idx: int, + bboxes: list, + droped_bboxes: list, + expect_drop_bboxes: list, + save_path: str, + expected_page_id: int, +): + """ + 以覆盖的方式写个临时的pdf,用于debug + """ + if page_idx != expected_page_id: + return + + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = raw_pdf_doc[page_idx].rect.width + height = raw_pdf_doc[page_idx].rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in droped_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2) + shape.finish() + shape.commit() + + for bbox in expect_drop_bboxes: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish(color=fitz.pdfcolor["red"], fill=None) + shape.finish() + shape.commit() + + # shape.insert_textbox(fitz.Rect(200, 0, 600, 20), f"total bboxes: {len(bboxes)}", fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = os.path.dirname(save_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + doc.save(save_path) + doc.close() + + +def debug_show_page( + page, + bboxes1: list, + bboxes2: list, + bboxes3: list, +): + save_path = "./tmp/debug.pdf" + if os.path.exists(save_path): + # 删除已经存在的文件 + os.remove(save_path) + # 创建一个新的空白 PDF 文件 + doc = fitz.open("") + + width = page.rect.width + height = page.rect.height + new_page = doc.new_page(width=width, height=height) + + shape = new_page.new_shape() + for bbox in bboxes1: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + shape.draw_rect(rect) + shape.finish( + color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2 + ) + shape.finish() + shape.commit() + + for bbox in bboxes2: + # 原始box画上去 + rect = fitz.Rect(*bbox[0:4]) + shape = new_page.new_shape() + 
+
+
+def debug_show_page(
+    page,
+    bboxes1: list,
+    bboxes2: list,
+    bboxes3: list,
+):
+    save_path = "./tmp/debug.pdf"
+    if os.path.exists(save_path):
+        # remove the existing file
+        os.remove(save_path)
+    # create a new blank PDF
+    doc = fitz.open("")
+
+    width = page.rect.width
+    height = page.rect.height
+    new_page = doc.new_page(width=width, height=height)
+
+    for bbox in bboxes1:
+        # draw the original boxes
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(
+            color=fitz.pdfcolor["red"], fill=fitz.pdfcolor["blue"], fill_opacity=0.2
+        )
+        shape.commit()
+
+    for bbox in bboxes2:
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=None, fill=fitz.pdfcolor["yellow"], fill_opacity=0.2)
+        shape.commit()
+
+    for bbox in bboxes3:
+        rect = fitz.Rect(*bbox[0:4])
+        shape = new_page.new_shape()
+        shape.draw_rect(rect)
+        shape.finish(color=fitz.pdfcolor["red"], fill=None)
+        shape.commit()
+
+    parent_dir = os.path.dirname(save_path)
+    if not os.path.exists(parent_dir):
+        os.makedirs(parent_dir)
+
+    doc.save(save_path)
+    doc.close()
+
+
+def draw_layout_bbox_on_page(
+    raw_pdf_doc: fitz.Document, paras_dict: dict, header, footer, pdf_path: str
+):
+    """
+    Draw the layout bboxes on each page and save the result to pdf_path.
+    """
+    # check whether the file already exists
+    is_new_pdf = False
+    if os.path.exists(pdf_path):
+        # open the existing PDF
+        doc = fitz.open(pdf_path)
+    else:
+        # create a new blank PDF
+        is_new_pdf = True
+        doc = fitz.open("")
+
+    for k, v in paras_dict.items():
+        page_idx = v["page_idx"]
+        layouts = v["layout_bboxes"]
+        page = doc[page_idx]
+        shape = page.new_shape()
+        for order, layout in enumerate(layouts):
+            border_offset = 1
+            rect_box = layout["layout_bbox"]
+            layout_label = layout["layout_label"]
+            fill_color = fitz.pdfcolor["pink"] if layout_label == "U" else None
+            rect_box = [
+                rect_box[0] + 1,
+                rect_box[1] - border_offset,
+                rect_box[2] - 1,
+                rect_box[3] + border_offset,
+            ]
+            rect = fitz.Rect(*rect_box)
+            shape.draw_rect(rect)
+            shape.finish(color=fitz.pdfcolor["red"], fill=fill_color, fill_opacity=0.4)
+            # draw the reading-order index on the layout box
+            font_size = 10
+            shape.insert_text(
+                (rect_box[0] + 1, rect_box[1] + font_size),
+                f"{order}",
+                fontsize=font_size,
+                color=(0, 0, 0),
+            )
+
+        # draw the header and footer regions
+        if header:
+            shape.draw_rect(fitz.Rect(header))
+            shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2)
+        if footer:
+            shape.draw_rect(fitz.Rect(footer))
+            shape.finish(color=None, fill=fitz.pdfcolor["black"], fill_opacity=0.2)
+
+        shape.commit()
+
+    if is_new_pdf:
+        doc.save(pdf_path)
+    else:
+        doc.saveIncr()
+    doc.close()
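+
+# A minimal invocation sketch (hypothetical values; "paras_dict" maps page keys
+# to dicts carrying "page_idx" and "layout_bboxes", and the "layout_label" value
+# below is made up for illustration -- only "U" is treated specially):
+#
+#   draw_layout_bbox_on_page(
+#       raw_pdf_doc=fitz.open("input.pdf"),
+#       paras_dict={"page_0": {"page_idx": 0, "layout_bboxes": [
+#           {"layout_bbox": [36, 50, 560, 320], "layout_label": "GOOD_LAYOUT"},
+#       ]}},
+#       header=[0, 0, 612, 40],
+#       footer=[0, 752, 612, 792],
+#       pdf_path="./tmp/layout_vis.pdf",  # appended incrementally if it exists
+#   )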
fontname="helv", fontsize=12, + # color=(0, 0, 0)) + # shape.finish(color=fitz.pdfcolor['black']) + # shape.commit() + + parent_dir = os.path.dirname(pdf_path) + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + + if is_new_pdf: + doc.save(pdf_path) + else: + doc.saveIncr() + doc.close() From 390fdb2cd5769c1a273ab8ddf275413a8ee7e8d2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=AE=B8=E7=91=9E?= Date: Thu, 21 Mar 2024 12:27:49 +0800 Subject: [PATCH 2/2] fix: fix typo --- magic_pdf/pipeline.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/pipeline.py b/magic_pdf/pipeline.py index d4d43a93..c2008a19 100644 --- a/magic_pdf/pipeline.py +++ b/magic_pdf/pipeline.py @@ -24,7 +24,7 @@ from magic_pdf.filter.pdf_meta_scan import pdf_meta_scan from loguru import logger from magic_pdf.pdf_parse_for_train import parse_pdf_for_train -rom magic_pdf.train_utils.convert_to_train_format import convert_to_train_format +from magic_pdf.train_utils.convert_to_train_format import convert_to_train_format from app.common.s3 import get_s3_config, get_s3_client from magic_pdf.pdf_parse_by_ocr import parse_pdf_by_ocr