Skip to content

Commit

Permalink
Merge pull request #69 from myhloli/master
Browse files Browse the repository at this point in the history
change remove spans logic
  • Loading branch information
myhloli authored Apr 25, 2024
2 parents 7a74566 + fcf94b2 commit 6c8361f
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions magic_pdf/pre_proc/ocr_span_list_modify.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,19 @@
def remove_overlaps_min_spans(spans):
    """Remove the smaller span of each pair of heavily overlapping spans.

    Every ordered pair of distinct spans is checked with
    ``get_minbox_if_overlap_by_ratio`` using a ratio of 0.65. When that
    helper returns a bbox (presumably the smaller of the two boxes -- confirm
    against the helper), the span in ``spans`` carrying that bbox is scheduled
    for removal.

    Removal is deferred until the scan is finished so that ``spans`` is never
    mutated while it is being iterated.

    Args:
        spans: list of span dicts, each with a ``'bbox'`` key. The list is
            modified in place.

    Returns:
        A ``(spans, dropped_spans)`` tuple: the same list object with the
        dropped spans removed, and the list of removed spans, each tagged
        with ``DropTag.SPAN_OVERLAP`` under the ``'tag'`` key.
    """
    dropped_spans = []
    # Collect the smaller span of every overlapping pair.
    for span1 in spans:
        for span2 in spans:
            if span1 == span2:
                continue
            overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
            if overlap_box is None:
                continue
            span_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
            # Each pair is seen twice ((a, b) and (b, a)) and one span may
            # overlap several others; record it once so the removal below
            # does not raise ValueError or drop an unrelated equal span.
            if span_to_remove is not None and span_to_remove not in dropped_spans:
                dropped_spans.append(span_to_remove)

    # Looping over an empty list is a no-op, so no explicit length guard is
    # needed.
    for dropped_span in dropped_spans:
        spans.remove(dropped_span)
        dropped_span['tag'] = DropTag.SPAN_OVERLAP
    return spans, dropped_spans


Expand Down

0 comments on commit 6c8361f

Please sign in to comment.