Skip to content

Commit

Permalink
Merge pull request #53 from papayalove/master
Browse files Browse the repository at this point in the history
更新了para_split
  • Loading branch information
myhloli authored Apr 23, 2024
2 parents bdf6e60 + a02a356 commit 179ab59
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions magic_pdf/para/para_split_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,21 @@ def split_indices(slen, index_array):
"""
for l in lines:
first_char = __get_span_text(l['spans'][0])[0]
layout_left = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)[0]
if l['bbox'][0] == layout_left:
if first_char.isupper() or first_char.isdigit():
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
layout = __find_layout_bbox_by_line(l['bbox'], new_layout_bboxes)
if not layout:
line_fea_encode.append(0)
else:
if first_char.isupper():
line_fea_encode.append(2)
layout_left = layout[0]
if l['bbox'][0] == layout_left:
if first_char.isupper() or first_char.isdigit():
line_fea_encode.append(1)
else:
line_fea_encode.append(4)
else:
line_fea_encode.append(3)
if first_char.isupper():
line_fea_encode.append(2)
else:
line_fea_encode.append(3)

# Then segment by the encoding: lines where codes 1, 2, 3 occur consecutively at least 2 times are treated as a list.

Expand Down

0 comments on commit 179ab59

Please sign in to comment.