Skip to content

Commit

Permalink
Merge pull request #89 from myhloli/master
Browse files Browse the repository at this point in the history
fix complicated layout logic
  • Loading branch information
myhloli authored Apr 29, 2024
2 parents 92f6684 + 442f368 commit 0d2063f
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 30 deletions.
1 change: 1 addition & 0 deletions magic_pdf/libs/drop_reason.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@

class DropReason:
TEXT_BLCOK_HOR_OVERLAP = "text_block_horizontal_overlap" # 文字块有水平互相覆盖,导致无法准确定位文字顺序
USEFUL_BLOCK_HOR_OVERLAP = "useful_block_horizontal_overlap" # 需保留的block水平覆盖
COMPLICATED_LAYOUT = "complicated_layout" # 复杂的布局,暂时不支持
TOO_MANY_LAYOUT_COLUMNS = "too_many_layout_columns" # 目前不支持分栏超过2列的
COLOR_BACKGROUND_TEXT_BOX = "color_background_text_box" # 含有带色块的PDF,色块会改变阅读顺序,目前不支持带底色文字块的PDF。
Expand Down
50 changes: 36 additions & 14 deletions magic_pdf/pdf_parse_by_ocr_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,32 @@
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap


def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Remove the smaller box of a horizontally overlapping pair, if one exists.

    The first four items of every entry are wrapped as ``{"bbox": ...}`` and
    passed to ``check_useful_block_horizontal_overlap``. When that check
    reports an overlap, every entry whose first four items equal the returned
    smaller bbox is removed from ``all_bboxes`` in place.

    Args:
        all_bboxes: list of sequences whose first four items are used as the
            bbox. The list is mutated in place.

    Returns:
        A tuple ``(is_useful_block_horz_overlap, all_bboxes)``: the flag
        returned by the overlap check and the same list object, possibly
        shortened.
    """
    useful_blocks = [{"bbox": bbox[:4]} for bbox in all_bboxes]
    is_useful_block_horz_overlap, smaller_bbox = check_useful_block_horizontal_overlap(useful_blocks)
    if is_useful_block_horz_overlap:
        # Log the same reason the caller records as drop_reason
        # (USEFUL_BLOCK_HOR_OVERLAP), not the unrelated text-block constant.
        logger.warning(
            f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")
        # Iterate over a snapshot so removing from the live list is safe.
        # NOTE(review): if no entry compares equal to smaller_bbox (e.g. a
        # list/tuple mismatch), nothing is removed while the flag stays
        # truthy; a caller that loops until the flag is falsy would then not
        # terminate -- confirm the bbox types match.
        for bbox in all_bboxes.copy():
            if smaller_bbox == bbox[:4]:
                all_bboxes.remove(bbox)

    return is_useful_block_horz_overlap, all_bboxes

def parse_pdf_by_ocr(pdf_bytes,
model_list,
imageWriter,
start_page_id=0,
end_page_id=None,
debug_mode=False,
):
need_drop = False
drop_reason = ""

pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)

Expand Down Expand Up @@ -66,16 +85,14 @@ def parse_pdf_by_ocr(pdf_bytes,
interline_equations, page_w, page_h)

"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue

while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
if is_useful_block_horz_overlap:
need_drop = True
drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
else:
break

'''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h]
Expand All @@ -84,19 +101,23 @@ def parse_pdf_by_ocr(pdf_bytes,
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue
need_drop = True
drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT

"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue
need_drop = True
drop_reason = DropReason.COMPLICATED_LAYOUT

layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue
need_drop = True
drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS


'''根据layout顺序,对当前页面所有需要留下的block进行排序'''
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
Expand All @@ -119,7 +140,8 @@ def parse_pdf_by_ocr(pdf_bytes,

'''构造pdf_info_dict'''
page_info = ocr_construct_page_component_v2(fix_blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks)
images, tables, interline_equations, discarded_blocks,
need_drop, drop_reason)
pdf_info_dict[f"page_{page_id}"] = page_info

"""分段"""
Expand Down
48 changes: 35 additions & 13 deletions magic_pdf/pdf_parse_by_txt_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,22 @@
from magic_pdf.para.para_split_v2 import para_split
from magic_pdf.pre_proc.resolve_bbox_conflict import check_useful_block_horizontal_overlap

def remove_horizontal_overlap_block_which_smaller(all_bboxes):
    """Drop the smaller member of a horizontally overlapping box pair.

    Builds ``{"bbox": box[:4]}`` records from ``all_bboxes`` and asks
    ``check_useful_block_horizontal_overlap`` whether any two of them overlap
    horizontally. If so, all entries whose first four items equal the
    reported smaller bbox are removed from ``all_bboxes`` in place.

    Args:
        all_bboxes: list of sequences; only the first four items of each are
            used as the bbox. Mutated in place.

    Returns:
        ``(overlap_flag, all_bboxes)`` where ``overlap_flag`` is whatever the
        overlap check returned and ``all_bboxes`` is the same list object.
    """
    candidates = [{"bbox": box[:4]} for box in all_bboxes]
    has_overlap, smaller_bbox = check_useful_block_horizontal_overlap(candidates)

    # Nothing overlaps: hand the list back untouched.
    if not has_overlap:
        return has_overlap, all_bboxes

    logger.warning(
        f"skip this page, reason: {DropReason.USEFUL_BLOCK_HOR_OVERLAP}")

    # Collect the matches first, then remove them, so the list is never
    # mutated while it is being scanned.
    to_remove = [box for box in all_bboxes if smaller_bbox == box[:4]]
    for box in to_remove:
        all_bboxes.remove(box)

    return has_overlap, all_bboxes


def txt_spans_extract(pdf_page, inline_equations, interline_equations):
text_raw_blocks = pdf_page.get_text("dict", flags=fitz.TEXTFLAGS_TEXT)["blocks"]
Expand Down Expand Up @@ -91,6 +107,9 @@ def parse_pdf_by_txt(
end_page_id=None,
debug_mode=False,
):
need_drop = False
drop_reason = ""

pdf_bytes_md5 = compute_md5(pdf_bytes)
pdf_docs = fitz.open("pdf", pdf_bytes)

Expand Down Expand Up @@ -141,16 +160,14 @@ def parse_pdf_by_txt(
)

"""在切分之前,先检查一下bbox是否有左右重叠的情况,如果有,那么就认为这个pdf暂时没有能力处理好,这种左右重叠的情况大概率是由于pdf里的行间公式、表格没有被正确识别出来造成的 """
useful_blocks = []
for bbox in all_bboxes:
useful_blocks.append({
"bbox": bbox[:4]
})
is_useful_block_horz_overlap = check_useful_block_horizontal_overlap(useful_blocks)
if is_useful_block_horz_overlap:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TEXT_BLCOK_HOR_OVERLAP}")
continue

while True: # 循环检查左右重叠的情况,如果存在就删除掉较小的那个bbox,直到不存在左右重叠的情况
is_useful_block_horz_overlap, all_bboxes = remove_horizontal_overlap_block_which_smaller(all_bboxes)
if is_useful_block_horz_overlap:
need_drop = True
drop_reason = DropReason.USEFUL_BLOCK_HOR_OVERLAP
else:
break

'''根据区块信息计算layout'''
page_boundry = [0, 0, page_w, page_h]
Expand All @@ -159,19 +176,22 @@ def parse_pdf_by_txt(
if len(text_blocks) > 0 and len(all_bboxes) > 0 and len(layout_bboxes) == 0:
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.CAN_NOT_DETECT_PAGE_LAYOUT}")
continue
need_drop = True
drop_reason = DropReason.CAN_NOT_DETECT_PAGE_LAYOUT

"""以下去掉复杂的布局和超过2列的布局"""
if any([lay["layout_label"] == LAYOUT_UNPROC for lay in layout_bboxes]): # 复杂的布局
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.COMPLICATED_LAYOUT}")
continue
need_drop = True
drop_reason = DropReason.COMPLICATED_LAYOUT

layout_column_width = get_columns_cnt_of_layout(layout_tree)
if layout_column_width > 2: # 去掉超过2列的布局pdf
logger.warning(
f"pdf: {pdf_bytes_md5}, skip this page, page_id: {page_id}, reason: {DropReason.TOO_MANY_LAYOUT_COLUMNS}")
continue
need_drop = True
drop_reason = DropReason.TOO_MANY_LAYOUT_COLUMNS

"""根据layout顺序,对当前页面所有需要留下的block进行排序"""
sorted_blocks = sort_blocks_by_layout(all_bboxes, layout_bboxes)
Expand Down Expand Up @@ -211,6 +231,8 @@ def parse_pdf_by_txt(
tables,
interline_equations,
discarded_blocks,
need_drop,
drop_reason
)
pdf_info_dict[f"page_{page_id}"] = page_info

Expand Down
4 changes: 3 additions & 1 deletion magic_pdf/pre_proc/construct_page_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ def ocr_construct_page_component(blocks, layout_bboxes, page_id, page_w, page_h,


def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page_h, layout_tree,
images, tables, interline_equations, discarded_blocks):
images, tables, interline_equations, discarded_blocks, need_drop, drop_reason):
return_dict = {
'preproc_blocks': blocks,
'layout_bboxes': layout_bboxes,
Expand All @@ -66,5 +66,7 @@ def ocr_construct_page_component_v2(blocks, layout_bboxes, page_id, page_w, page
'tables': tables,
'interline_equations': interline_equations,
'discarded_blocks': discarded_blocks,
'need_drop': need_drop,
'drop_reason': drop_reason,
}
return return_dict
9 changes: 7 additions & 2 deletions magic_pdf/pre_proc/resolve_bbox_conflict.py
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,12 @@ def check_useful_block_horizontal_overlap(useful_blocks: list) -> bool:

for i in range(len(useful_bboxes)):
for j in range(i + 1, len(useful_bboxes)):
area_i = (useful_bboxes[i][2] - useful_bboxes[i][0]) * (useful_bboxes[i][3] - useful_bboxes[i][1])
area_j = (useful_bboxes[j][2] - useful_bboxes[j][0]) * (useful_bboxes[j][3] - useful_bboxes[j][1])
if _is_left_overlap(useful_bboxes[i], useful_bboxes[j]) or _is_left_overlap(useful_bboxes[j], useful_bboxes[i]):
return True
if area_i > area_j:
return True, useful_bboxes[j]
else:
return True, useful_bboxes[i]

return False
return False, None

0 comments on commit 0d2063f

Please sign in to comment.