From 1d08865f4a465dc2fe2a46d5dfeb14fd0c043695 Mon Sep 17 00:00:00 2001 From: myhloli Date: Wed, 22 Jan 2025 11:39:43 +0800 Subject: [PATCH 1/2] refactor(pdf_parse): comment out char bbox validation logic - Disable the check that skipped characters with invalid (near-zero width or height) bounding boxes - Characters with degenerate bounding boxes are now passed on to span matching instead of being filtered out --- magic_pdf/pdf_parse_union_core_v2.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/magic_pdf/pdf_parse_union_core_v2.py b/magic_pdf/pdf_parse_union_core_v2.py index 2ba16854..74a0d778 100644 --- a/magic_pdf/pdf_parse_union_core_v2.py +++ b/magic_pdf/pdf_parse_union_core_v2.py @@ -118,9 +118,10 @@ def fill_char_in_spans(spans, all_chars): for char in all_chars: # 跳过非法bbox的char - x1, y1, x2, y2 = char['bbox'] - if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01: - continue + # x1, y1, x2, y2 = char['bbox'] + # if abs(x1 - x2) <= 0.01 or abs(y1 - y2) <= 0.01: + # continue + for span in spans: if calculate_char_in_span(char['bbox'], span['bbox'], char['c']): span['chars'].append(char) From c38060d5b9f5954af1ca77c3e152dd85ff0c79dd Mon Sep 17 00:00:00 2001 From: myhloli Date: Wed, 22 Jan 2025 14:24:05 +0800 Subject: [PATCH 2/2] fix(boxbase): handle cases where bounding box area is zero - Add a check to return 0 when either bbox1_area or bbox2_area is zero - This prevents division by zero errors when calculating IoU --- magic_pdf/libs/boxbase.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/magic_pdf/libs/boxbase.py b/magic_pdf/libs/boxbase.py index 52779a22..2813121b 100644 --- a/magic_pdf/libs/boxbase.py +++ b/magic_pdf/libs/boxbase.py @@ -185,10 +185,13 @@ def calculate_iou(bbox1, bbox2): bbox1_area = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1]) bbox2_area = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1]) + if any([bbox1_area == 0, bbox2_area == 0]): + return 0 + # Compute the intersection over union by taking the intersection area # and dividing it by the 
sum of both areas minus the intersection area - iou = intersection_area / float(bbox1_area + bbox2_area - - intersection_area) + iou = intersection_area / float(bbox1_area + bbox2_area - intersection_area) + return iou