From fcea39d36b323de22af7161e5aa90e9b1b1affbd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?=
Date: Thu, 7 Mar 2024 20:41:41 +0800
Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0ocr=E6=A8=A1=E5=BC=8F?=
 =?UTF-8?q?=E7=9A=84layout=E8=A7=A3=E6=9E=90=E5=8A=9F=E8=83=BD?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 demo/ocr_demo.py                              |  16 ++-
 magic_pdf/pdf_parse_by_ocr.py                 |  16 +--
 magic_pdf/pre_proc/ocr_detect_layout.py       | 123 ++++++++++++++++++
 .../{libs => pre_proc}/ocr_dict_merge.py      |   0
 4 files changed, 140 insertions(+), 15 deletions(-)
 create mode 100644 magic_pdf/pre_proc/ocr_detect_layout.py
 rename magic_pdf/{libs => pre_proc}/ocr_dict_merge.py (100%)

diff --git a/demo/ocr_demo.py b/demo/ocr_demo.py
index 28569328..d4e90b6f 100644
--- a/demo/ocr_demo.py
+++ b/demo/ocr_demo.py
@@ -28,10 +28,12 @@ def read_json_file(file_path):
 
 
 if __name__ == '__main__':
-    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
-    ocr_pdf_info = read_json_file(ocr_json_file_path)
-    pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
-    markdown_text = mk_nlp_markdown(pdf_info_dict)
-    logger.info(markdown_text)
-    save_markdown(markdown_text, ocr_json_file_path)
-
+    ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
+    try:
+        ocr_pdf_info = read_json_file(ocr_json_file_path)
+        pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
+        markdown_text = mk_nlp_markdown(pdf_info_dict)
+        logger.info(markdown_text)
+        save_markdown(markdown_text, ocr_json_file_path)
+    except Exception as e:
+        logger.exception(e)
diff --git a/magic_pdf/pdf_parse_by_ocr.py b/magic_pdf/pdf_parse_by_ocr.py
index 1b72cc57..35a56dac 100644
--- a/magic_pdf/pdf_parse_by_ocr.py
+++ b/magic_pdf/pdf_parse_by_ocr.py
@@ -1,12 +1,12 @@
-from loguru import logger
+from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
+from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
 
 
-from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans
-
-def construct_page_component(page_id, blocks):
+def construct_page_component(page_id, blocks, layout_bboxes):
     return_dict = {
         'preproc_blocks': blocks,
         'page_idx': page_id,
+        'layout_bboxes': layout_bboxes,
     }
     return return_dict
 
@@ -74,9 +74,6 @@ def parse_pdf_by_ocr(
         lines = merge_spans_to_line(spans)
         # logger.info(lines)
 
-        # 从ocr_page_info中获取layout信息
-
-
         # 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
         blocks = []
         for line in lines:
@@ -85,8 +82,11 @@ def parse_pdf_by_ocr(
                 "lines": [line],
             })
 
+        # Parse the layout boxes from ocr_page_info (boxes contained in another box are dropped, partial overlaps are trimmed)
+        layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])
+
         # 构造pdf_info_dict
-        page_info = construct_page_component(page_id, blocks)
+        page_info = construct_page_component(page_id, blocks, layout_bboxes)
         pdf_info_dict[f"page_{page_id}"] = page_info
 
     return pdf_info_dict
diff --git a/magic_pdf/pre_proc/ocr_detect_layout.py b/magic_pdf/pre_proc/ocr_detect_layout.py
new file mode 100644
index 00000000..be1daa52
--- /dev/null
+++ b/magic_pdf/pre_proc/ocr_detect_layout.py
@@ -0,0 +1,123 @@
+from magic_pdf.libs.boxbase import _is_part_overlap, _is_in
+
+def get_center_point(bbox):
+    """
+    Compute the center point of a bounding box.
+    Args:
+        bbox (list): box coordinates, four elements: top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        list: the center point, two elements: x and y.
+    """
+    return [(bbox[0] + bbox[2]) / 2, (bbox[1] + bbox[3]) / 2]
+
+
+def get_area(bbox):
+    """
+    Compute the area of a bounding box.
+    Args:
+        bbox (list): box coordinates, four elements: top-left x, top-left y, bottom-right x, bottom-right y.
+    Returns:
+        float: the area of the box.
+    """
+    return (bbox[2] - bbox[0]) * (bbox[3] - bbox[1])
+
+
+def adjust_layouts(layout_bboxes):
+    """
+    Resolve partial overlaps between layout boxes by shrinking the larger box of each overlapping pair.
+
+    Args:
+        layout_bboxes (list): dicts whose 'layout_bbox' entry is [x0, y0, x1, y1]; the boxes are modified in place.
+
+    Returns:
+        list: the same list object that was passed in.
+    """
+    for i in range(len(layout_bboxes)):
+        for j in range(i + 1, len(layout_bboxes)):
+            box_i = layout_bboxes[i]["layout_bbox"]
+            box_j = layout_bboxes[j]["layout_bbox"]
+            # Pass the raw boxes (not the wrapping dicts), matching how _is_in is called in layout_detect
+            if not _is_part_overlap(box_i, box_j):
+                continue
+
+            # The larger box is the one that gets trimmed; the smaller box is left untouched
+            if get_area(box_i) > get_area(box_j):
+                larger, smaller = box_i, box_j
+            else:
+                larger, smaller = box_j, box_i
+            center_l = get_center_point(larger)
+            center_s = get_center_point(smaller)
+
+            # The axis with the greater center distance is treated as the overlap direction, and
+            # the centers (not the edges, which always cross when boxes overlap) pick the side to trim
+            if abs(center_l[0] - center_s[0]) > abs(center_l[1] - center_s[1]):  # side-by-side overlap
+                if center_s[0] < center_l[0]:
+                    larger[0] = smaller[2]
+                else:
+                    larger[2] = smaller[0]
+            else:  # stacked overlap
+                if center_s[1] < center_l[1]:
+                    larger[1] = smaller[3]
+                else:
+                    larger[3] = smaller[1]
+
+    return layout_bboxes
+
+
+def layout_detect(layout_info):
+    """
+    Parse the input layout information, extract the bounding box of every sub-layout, and adjust the resulting boxes.
+
+    Args:
+        layout_info (list): sub-layout entries; each is a dict with a 'poly' field holding the coordinates of that sub-layout.
+
+    Returns:
+        list: the adjusted sub-layout boxes; each is a dict with a 'layout_bbox' field holding [x0, y0, x1, y1].
+
+    """
+    # Layout boxes collected from the input
+    layout_bboxes = []
+    # Walk over every sub-layout
+    for sub_layout in layout_info:
+        # 'poly' holds 8 values; values 1-2 and 5-6 become the box corners (presumably top-left and bottom-right - confirm)
+        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
+        # Wrap the box in a dict
+        layout_bbox = {
+            "layout_bbox": [x0, y0, x1, y1],
+        }
+        # Add the wrapped box to the list
+        layout_bboxes.append(layout_bbox)
+
+    # Boxes that survive the containment filter below
+    new_layout_bboxes = []
+    # Walk over every layout box
+    for i in range(len(layout_bboxes)):
+        # Flag telling whether the current box should be kept
+        keep = True
+        # Coordinates of the current box
+        box_i = layout_bboxes[i]["layout_bbox"]
+
+        # Compare against every other box
+        for j in range(len(layout_bboxes)):
+            # Skip the current box itself
+            if i != j:
+                # Coordinates of the other box
+                box_j = layout_bboxes[j]["layout_bbox"]
+                # Check whether box_i is contained in box_j
+                if _is_in(box_i, box_j):
+                    # The current box is contained in another box, so it is not kept
+                    keep = False
+                    # Leave the inner loop
+                    break
+
+        # Keep the current box only if no other box contains it
+        if keep:
+            new_layout_bboxes.append(layout_bboxes[i])
+
+    # Trim partial overlaps between the remaining boxes
+    layout_bboxes = adjust_layouts(new_layout_bboxes)
+
+    # Return the adjusted list of layout boxes
+    return layout_bboxes
+
+
diff --git a/magic_pdf/libs/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py
similarity index 100%
rename from magic_pdf/libs/ocr_dict_merge.py
rename to magic_pdf/pre_proc/ocr_dict_merge.py