Skip to content

Commit

Permalink
Merge pull request #71 from myhloli/master
Browse files: browse the repository at this point in the history
fix remove error: do not add the same block or span to a removal list twice, so that each one is removed only once
  • Loading branch information
myhloli authored Apr 25, 2024
2 parents 8c64474 + f70289f commit 9a3fe26
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 10 deletions.
6 changes: 4 additions & 2 deletions magic_pdf/pre_proc/ocr_detect_all_bboxes.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks):
for discarded_block in discarded_blocks:
block_bbox = block[:4]
if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6:
need_remove.append(block)
if block not in need_remove:
need_remove.append(block)
break

if len(need_remove) > 0:
for block in need_remove:
Expand All @@ -90,7 +92,7 @@ def remove_overlaps_min_blocks(all_bboxes):
overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8)
if overlap_box is not None:
bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None)
if bbox_to_remove is not None:
if bbox_to_remove is not None and bbox_to_remove not in need_remove:
need_remove.append(bbox_to_remove)

if len(need_remove) > 0:
Expand Down
17 changes: 9 additions & 8 deletions magic_pdf/pre_proc/ocr_span_list_modify.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@ def remove_overlaps_min_spans(spans):
if span1 != span2:
overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65)
if overlap_box is not None:
bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if bbox_to_remove is not None:
dropped_spans.append(bbox_to_remove)
span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None)
if span_need_remove is not None and span_need_remove not in dropped_spans:
dropped_spans.append(span_need_remove)

if len(dropped_spans) > 0:
for dropped_span in dropped_spans:
spans.remove(dropped_span)
dropped_span['tag'] = DropTag.SPAN_OVERLAP
for span_need_remove in dropped_spans:
spans.remove(span_need_remove)
span_need_remove['tag'] = DropTag.SPAN_OVERLAP

return spans, dropped_spans

Expand All @@ -33,8 +33,9 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes):
for span in spans:
for removed_bbox in need_remove_spans_bboxes:
if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5:
need_remove_spans.append(span)
break
if span not in need_remove_spans:
need_remove_spans.append(span)
break

if len(need_remove_spans) > 0:
for span in need_remove_spans:
Expand Down

0 comments on commit 9a3fe26

Please sign in to comment.