diff --git a/magic_pdf/libs/boxbase.py b/magic_pdf/libs/boxbase.py index 52779a22..2813121b 100644 --- a/magic_pdf/libs/boxbase.py +++ b/magic_pdf/libs/boxbase.py @@ -185,10 +185,13 @@ def calculate_iou(bbox1, bbox2): bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + if any([bbox1_area == 0, bbox2_area == 0]): + return 0 + # Compute the intersection over union by taking the intersection area # and dividing it by the sum of both areas minus the intersection area - iou = intersection_area / float(bbox1_area + bbox2_area - - intersection_area) + iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area) + return iou diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 2ba16854..74a0d778 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -118,9 +118,10 @@ def fill_char_in_spans(spans, all_chars): for char in all_chars: # 跳过非法bbox的char - x1, y1, x2, y2 = char['bbox'] - if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01: - continue + # x1, y1, x2, y2 = char['bbox'] + # if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01: + # continue + for span in spans: if calculate_char_in_span(char['bbox'], span['bbox'], char['c']): span['chars'].append(char)