From f70289f99e3997d803df7a881776a3fac0b7c747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Thu, 25 Apr 2024 16:12:40 +0800 Subject: [PATCH] fix remove error --- magic_pdf/pre_proc/ocr_detect_all_bboxes.py | 6 ++++-- magic_pdf/pre_proc/ocr_span_list_modify.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py index 9171b163..00f8385b 100644 --- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py +++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py @@ -71,7 +71,9 @@ def remove_need_drop_blocks(all_bboxes, discarded_blocks): for discarded_block in discarded_blocks: block_bbox = block[:4] if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, discarded_block['bbox']) > 0.6: - need_remove.append(block) + if block not in need_remove: + need_remove.append(block) + break if len(need_remove) > 0: for block in need_remove: @@ -90,7 +92,7 @@ def remove_overlaps_min_blocks(all_bboxes): overlap_box = get_minbox_if_overlap_by_ratio(block1_bbox, block2_bbox, 0.8) if overlap_box is not None: bbox_to_remove = next((block for block in all_bboxes if block[:4] == overlap_box), None) - if bbox_to_remove is not None: + if bbox_to_remove is not None and bbox_to_remove not in need_remove: need_remove.append(bbox_to_remove) if len(need_remove) > 0: diff --git a/magic_pdf/pre_proc/ocr_span_list_modify.py b/magic_pdf/pre_proc/ocr_span_list_modify.py index 1e56a980..6dc6a8d5 100644 --- a/magic_pdf/pre_proc/ocr_span_list_modify.py +++ b/magic_pdf/pre_proc/ocr_span_list_modify.py @@ -14,14 +14,14 @@ def remove_overlaps_min_spans(spans): if span1 != span2: overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65) if overlap_box is not None: - bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None) - if bbox_to_remove is not None: - dropped_spans.append(bbox_to_remove) + span_need_remove = next((span for span in spans if span['bbox'] == overlap_box), None) + if span_need_remove is not None and span_need_remove not in dropped_spans: + dropped_spans.append(span_need_remove) if len(dropped_spans) > 0: - for dropped_span in dropped_spans: - spans.remove(dropped_span) - dropped_span['tag'] = DropTag.SPAN_OVERLAP + for span_need_remove in dropped_spans: + spans.remove(span_need_remove) + span_need_remove['tag'] = DropTag.SPAN_OVERLAP return spans, dropped_spans @@ -33,8 +33,9 @@ def remove_spans_by_bboxes(spans, need_remove_spans_bboxes): for span in spans: for removed_bbox in need_remove_spans_bboxes: if calculate_overlap_area_in_bbox1_area_ratio(span['bbox'], removed_bbox) > 0.5: - need_remove_spans.append(span) - break + if span not in need_remove_spans: + need_remove_spans.append(span) + break if len(need_remove_spans) > 0: for span in need_remove_spans: