Skip to content

Commit

Permalink
Merge pull request #1329 from myhloli/dev
Browse files Browse the repository at this point in the history
feat(pre_proc): add function to remove overlapping characters in spans
  • Loading branch information
myhloli authored Dec 19, 2024
2 parents 7248676 + 2f4d4b0 commit 5eb9fee
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 1 deletion.
7 changes: 6 additions & 1 deletion magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@
from magic_pdf.pre_proc.cut_image import ocr_cut_image_and_table
from magic_pdf.pre_proc.ocr_detect_all_bboxes import ocr_prepare_bboxes_for_layout_split_v2
from magic_pdf.pre_proc.ocr_dict_merge import fill_spans_in_blocks, fix_block_spans_v2, fix_discarded_block
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, remove_overlaps_min_spans
from magic_pdf.pre_proc.ocr_span_list_modify import get_qa_need_list_v2, remove_overlaps_low_confidence_spans, \
remove_overlaps_min_spans, remove_overlaps_chars

os.environ['NO_ALBUMENTATIONS_UPDATE'] = '1' # Stop albumentations from checking for updates

Expand Down Expand Up @@ -120,6 +121,10 @@ def fill_char_in_spans(spans, all_chars):
empty_spans = []

for span in spans:

# 移除同一个span中重叠的char
span['chars'] = remove_overlaps_chars(span['chars'])

chars_to_content(span)
# 有的span中虽然没有字但有一两个空的占位符,用宽高和content长度过滤
if len(span['content']) * span['height'] < span['width'] * 0.5:
Expand Down
25 changes: 25 additions & 0 deletions magic_pdf/pre_proc/ocr_span_list_modify.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,31 @@ def remove_overlaps_low_confidence_spans(spans):
return spans, dropped_spans


def remove_overlaps_chars(chars):
    """Remove characters that are almost fully covered by a later character.

    Two characters are treated as duplicates when the IoU of their ``bbox``
    values, as computed by ``calculate_iou``, is greater than 0.95. Of any
    such pair the earlier entry is dropped and the later one is kept, so
    the last member of a run of duplicates survives.

    Entries are compared by position in the list, not by dict equality, so
    two records with identical content (an exact duplicate drawn twice) are
    also collapsed to a single entry.

    Args:
        chars: list of dicts, each carrying a ``'bbox'`` entry that
            ``calculate_iou`` accepts.

    Returns:
        The same list object, filtered in place, with the relative order of
        the surviving characters unchanged.
    """
    overlap_threshold = 0.95

    survivors = []
    for position, current in enumerate(chars):
        current_bbox = current['bbox']
        # A char is dropped exactly when some char after it overlaps it;
        # earlier overlapping chars have already been dropped in its favour.
        covered_by_later = any(
            calculate_iou(current_bbox, later['bbox']) > overlap_threshold
            for later in chars[position + 1:]
        )
        if not covered_by_later:
            survivors.append(current)

    # Slice-assign so callers holding a reference to the list see the result,
    # matching the in-place removal the function has always performed.
    chars[:] = survivors
    return chars


def remove_overlaps_min_spans(spans):
dropped_spans = []
# 删除重叠spans中较小的那些
Expand Down

0 comments on commit 5eb9fee

Please sign in to comment.