From fcf94b2d9ce68fdbd0a727779e54853828d3cd60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Thu, 25 Apr 2024 15:33:09 +0800 Subject: [PATCH] change remove spans logic --- magic_pdf/pre_proc/ocr_span_list_modify.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/magic_pdf/pre_proc/ocr_span_list_modify.py b/magic_pdf/pre_proc/ocr_span_list_modify.py index f231136d..0d23aa81 100644 --- a/magic_pdf/pre_proc/ocr_span_list_modify.py +++ b/magic_pdf/pre_proc/ocr_span_list_modify.py @@ -9,16 +9,19 @@ def remove_overlaps_min_spans(spans): dropped_spans = [] # 删除重叠spans中较小的那些 - for span1 in spans.copy(): - for span2 in spans.copy(): + for span1 in spans: + for span2 in spans: if span1 != span2: overlap_box = get_minbox_if_overlap_by_ratio(span1['bbox'], span2['bbox'], 0.65) if overlap_box is not None: bbox_to_remove = next((span for span in spans if span['bbox'] == overlap_box), None) if bbox_to_remove is not None: - spans.remove(bbox_to_remove) - bbox_to_remove['tag'] = DropTag.SPAN_OVERLAP dropped_spans.append(bbox_to_remove) + + if len(dropped_spans) > 0: + for dropped_span in dropped_spans: + spans.remove(dropped_span) + dropped_span['tag'] = DropTag.SPAN_OVERLAP return spans, dropped_spans