Skip to content

Commit

Permalink
增加ocr模式的layout解析功能
Browse files Browse the repository at this point in the history
  • Loading branch information
myhloli committed Mar 7, 2024
1 parent 00f3e32 commit fcea39d
Show file tree
Hide file tree
Showing 4 changed files with 140 additions and 15 deletions.
16 changes: 9 additions & 7 deletions demo/ocr_demo.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@ def read_json_file(file_path):


if __name__ == '__main__':
ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_0.json"
ocr_pdf_info = read_json_file(ocr_json_file_path)
pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
markdown_text = mk_nlp_markdown(pdf_info_dict)
logger.info(markdown_text)
save_markdown(markdown_text, ocr_json_file_path)

ocr_json_file_path = r"D:\project\20231108code-clean\ocr\new\demo_4\ocr_1(3).json"
try:
ocr_pdf_info = read_json_file(ocr_json_file_path)
pdf_info_dict = parse_pdf_by_ocr(ocr_pdf_info)
markdown_text = mk_nlp_markdown(pdf_info_dict)
logger.info(markdown_text)
save_markdown(markdown_text, ocr_json_file_path)
except Exception as e:
logger.error(e)
16 changes: 8 additions & 8 deletions magic_pdf/pdf_parse_by_ocr.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from loguru import logger
from magic_pdf.pre_proc.ocr_detect_layout import layout_detect
from magic_pdf.pre_proc.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans

from magic_pdf.libs.ocr_dict_merge import merge_spans_to_line, remove_overlaps_min_spans


def construct_page_component(page_id, blocks):
def construct_page_component(page_id, blocks, layout_bboxes):
    """Bundle the parsed results of one page into a single dict.

    Args:
        page_id: Index of the page; stored under ``'page_idx'``.
        blocks: Pre-processed blocks of the page; stored under
            ``'preproc_blocks'``.
        layout_bboxes: Layout boxes detected for the page; stored under
            ``'layout_bboxes'``.

    Returns:
        dict: A new dict holding the three values under the keys above.
    """
    return {
        'preproc_blocks': blocks,
        'page_idx': page_id,
        'layout_bboxes': layout_bboxes,
    }

Expand Down Expand Up @@ -74,9 +74,6 @@ def parse_pdf_by_ocr(
lines = merge_spans_to_line(spans)
# logger.info(lines)

# 从ocr_page_info中获取layout信息


# 目前不做block拼接,先做个结构,每个block中只有一个line,block的bbox就是line的bbox
blocks = []
for line in lines:
Expand All @@ -85,8 +82,11 @@ def parse_pdf_by_ocr(
"lines": [line],
})

# 从ocr_page_info中解析layout信息(按自然阅读方向排序,并修复重叠和交错的bad case)
layout_bboxes = layout_detect(ocr_page_info['subfield_dets'])

# 构造pdf_info_dict
page_info = construct_page_component(page_id, blocks)
page_info = construct_page_component(page_id, blocks, layout_bboxes)
pdf_info_dict[f"page_{page_id}"] = page_info

return pdf_info_dict
Expand Down
123 changes: 123 additions & 0 deletions magic_pdf/pre_proc/ocr_detect_layout.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from magic_pdf.libs.boxbase import _is_part_overlap, _is_in

def get_center_point(bbox):
    """Return the center point of a bounding box.

    Args:
        bbox (list): Box coordinates in the order
            ``[x0, y0, x1, y1]`` (top-left corner, then bottom-right corner).

    Returns:
        list: ``[center_x, center_y]``, the midpoint of the box on each axis.
    """
    center_x = (bbox[0] + bbox[2]) / 2
    center_y = (bbox[1] + bbox[3]) / 2
    return [center_x, center_y]


def get_area(bbox):
    """Return the area of a bounding box.

    Args:
        bbox (list): Box coordinates in the order
            ``[x0, y0, x1, y1]`` (top-left corner, then bottom-right corner).

    Returns:
        float: Width times height of the box. No clamping is applied, so an
        inverted box yields a negative or sign-flipped value.
    """
    width = bbox[2] - bbox[0]
    height = bbox[3] - bbox[1]
    return width * height


def _trim_larger_box(larger_box, smaller_box, lo, hi):
    """Shrink ``larger_box`` in place along one axis so it ends at ``smaller_box``.

    Args:
        larger_box (list): ``[x0, y0, x1, y1]`` of the box to shrink.
        smaller_box (list): ``[x0, y0, x1, y1]`` of the box left untouched.
        lo (int): Index of the near edge on the axis (0 for x, 1 for y).
        hi (int): Index of the far edge on the axis (2 for x, 3 for y).
    """
    larger_mid = (larger_box[lo] + larger_box[hi]) / 2
    smaller_mid = (smaller_box[lo] + smaller_box[hi]) / 2
    if larger_mid < smaller_mid:
        # The larger box lies before the smaller one on this axis, so its far
        # edge is the one intruding. Skip the trim if it would invert the box.
        if smaller_box[lo] > larger_box[lo]:
            larger_box[hi] = smaller_box[lo]
    elif smaller_box[hi] < larger_box[hi]:
        # The larger box lies after (or is centered on) the smaller one, so
        # its near edge is the one intruding.
        larger_box[lo] = smaller_box[hi]


def adjust_layouts(layout_bboxes):
    """Resolve partial overlaps between layout boxes by shrinking the larger one.

    Every pair of layouts is checked once. When a pair overlaps, the box with
    the larger area is trimmed so that it stops at the edge of the smaller
    box. The trim is horizontal when the centers are further apart in x than
    in y, and vertical otherwise.

    The coordinate lists are modified in place. The order of the layouts is
    left unchanged; no sorting is performed here.

    Args:
        layout_bboxes (list): Dicts that each hold a ``"layout_bbox"`` entry
            of the form ``[x0, y0, x1, y1]``.

    Returns:
        list: The same ``layout_bboxes`` list object, after adjustment.
    """
    for i, layout_i in enumerate(layout_bboxes):
        box_i = layout_i["layout_bbox"]
        for layout_j in layout_bboxes[i + 1:]:
            box_j = layout_j["layout_bbox"]

            # Pass the coordinate lists, not the wrapper dicts, matching how
            # _is_in is called in layout_detect.
            # NOTE(review): assumes _is_part_overlap takes two
            # [x0, y0, x1, y1] boxes - confirm against magic_pdf.libs.boxbase.
            if not _is_part_overlap(box_i, box_j):
                continue

            center_i = get_center_point(box_i)
            center_j = get_center_point(box_j)
            dx = abs(center_i[0] - center_j[0])
            dy = abs(center_i[1] - center_j[1])

            # On equal areas the second box of the pair is treated as larger.
            if get_area(box_i) > get_area(box_j):
                larger_box, smaller_box = box_i, box_j
            else:
                larger_box, smaller_box = box_j, box_i

            if dx > dy:
                # Side-by-side overlap: trim along the x axis.
                _trim_larger_box(larger_box, smaller_box, 0, 2)
            else:
                # Stacked overlap: trim along the y axis.
                _trim_larger_box(larger_box, smaller_box, 1, 3)

    return layout_bboxes




def _is_redundant_layout(index, boxes):
    """Tell whether ``boxes[index]`` should be dropped in favor of another box.

    A box is redundant when it lies inside a different box, or when it is an
    exact duplicate of a box that appears earlier in ``boxes``.
    """
    box = boxes[index]
    for other_index, other_box in enumerate(boxes):
        if other_index == index:
            continue
        if other_box == box:
            # Exact duplicates: keep only the first occurrence. Without this,
            # an inclusive containment test would report each copy as being
            # inside the other and drop both.
            # NOTE(review): presumably _is_in is inclusive on the edges -
            # confirm against magic_pdf.libs.boxbase.
            if other_index < index:
                return True
            continue
        if _is_in(box, other_box):
            return True
    return False


def layout_detect(layout_info):
    """Build layout boxes from OCR layout detections and clean them up.

    Each detection's ``'poly'`` is reduced to ``[x0, y0, x1, y1]``. Boxes
    that lie inside another box, and repeated copies of an identical box, are
    then removed. The remaining boxes are passed to ``adjust_layouts`` so
    that partial overlaps are trimmed. The detection order is preserved; no
    sorting is performed here.

    Args:
        layout_info (list): Dicts that each hold a ``'poly'`` entry with
            eight numbers. Elements 0-1 and 4-5 are used as the two box
            corners (presumably top-left and bottom-right - verify against
            the OCR output format). ``None`` or an empty list is accepted.

    Returns:
        list: Dicts of the form ``{"layout_bbox": [x0, y0, x1, y1]}``; an
        empty list when there is nothing to process.
    """
    if not layout_info:
        return []

    boxes = []
    for sub_layout in layout_info:
        # Only the first and third points of the polygon are needed.
        x0, y0, _, _, x1, y1, _, _ = sub_layout['poly']
        boxes.append([x0, y0, x1, y1])

    kept_layouts = [
        {"layout_bbox": box}
        for index, box in enumerate(boxes)
        if not _is_redundant_layout(index, boxes)
    ]

    # Trim the partial overlaps that remain between the kept boxes.
    return adjust_layouts(kept_layouts)


File renamed without changes.

0 comments on commit fcea39d

Please sign in to comment.