Skip to content

Commit

Permalink
Merge branch 'magicpdf:master' into master
Browse files (browse the repository at this point in the history)
  • Loading branch information
myhloli authored Apr 22, 2024
2 parents d7128a9 + c23883b commit 81eeef3
Showing 1 changed file with 11 additions and 2 deletions.
13 changes: 11 additions & 2 deletions magic_pdf/para/para_split_v2.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -132,7 +132,8 @@ def __valign_lines(blocks, layout_bboxes):

for layout_box in layout_bboxes:
blocks_in_layoutbox = [b for b in blocks if b["type"] == BlockType.Text and is_in_layout(b['bbox'], layout_box['layout_bbox'])]
if len(blocks_in_layoutbox) == 0:
if len(blocks_in_layoutbox) == 0 or len(blocks_in_layoutbox[0]["lines"]) == 0:
new_layout_bboxes.append(layout_box['layout_bbox'])
continue

x0_lst = np.array([[line['bbox'][0], 0] for block in blocks_in_layoutbox for line in block['lines']])
Expand Down Expand Up @@ -400,6 +401,8 @@ def __connect_list_inter_page(pre_page_paras, next_page_paras, pre_page_layout_b
"""
if len(pre_page_paras) == 0 or len(next_page_paras) == 0: # 0的时候最后的return 会出错
return False
if len(pre_page_paras[-1]) == 0 or len(next_page_paras[0]) == 0:
return False
if pre_page_paras[-1][-1]["type"] != BlockType.Text or next_page_paras[0][0]["type"] != BlockType.Text:
return False
if pre_page_list_info[1] and not next_page_list_info[0]: # 前一个是列表结尾,后一个是非列表开头,此时检测是否有相同的缩进
Expand Down Expand Up @@ -693,4 +696,10 @@ def para_split(pdf_info_dict, debug_mode, lang="en"):
page_paras = page['para_blocks']
new_layout_bbox = new_layout_of_pages[page_num]
__connect_middle_align_text(page_paras, new_layout_bbox, page_num, lang, debug_mode=debug_mode)
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)
__merge_signle_list_text(page_paras, new_layout_bbox, page_num, lang)

# layout展平
for page_num, page in enumerate(pdf_info_dict.values()):
page_paras = page['para_blocks']
page_blocks = [block for layout in page_paras for block in layout]
page["para_blocks"] = page_blocks

0 comments on commit 81eeef3

Please sign in to comment.