From 3f062ad7ed16d0cbeb565840219586d1aad670b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Tue, 23 Apr 2024 17:02:44 +0800 Subject: [PATCH 1/2] update confidence score 0.95->0.05 --- magic_pdf/model/magic_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/magic_pdf/model/magic_model.py b/magic_pdf/model/magic_model.py index c0cecea3..95931c86 100644 --- a/magic_pdf/model/magic_model.py +++ b/magic_pdf/model/magic_model.py @@ -48,7 +48,7 @@ def __fix_by_confidence(self): need_remove_list = [] layout_dets = model_page_info["layout_dets"] for layout_det in layout_dets: - if layout_det["score"] < 0.95: + if layout_det["score"] <= 0.05: need_remove_list.append(layout_det) else: continue From 1146206164fb6352f82b003dd99583e5dbbec11c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E8=B5=B5=E5=B0=8F=E8=92=99?= Date: Tue, 23 Apr 2024 17:10:47 +0800 Subject: [PATCH 2/2] fix annotation --- magic_pdf/pre_proc/ocr_dict_merge.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/pre_proc/ocr_dict_merge.py b/magic_pdf/pre_proc/ocr_dict_merge.py index 3b0302f4..c5c1cb31 100644 --- a/magic_pdf/pre_proc/ocr_dict_merge.py +++ b/magic_pdf/pre_proc/ocr_dict_merge.py @@ -120,14 +120,14 @@ def sort_blocks_by_layout(all_bboxes, layout_bboxes): # 如果是footnote则跳过 if block[7] == BlockType.Footnote: continue - block_bbox = [block[0], block[1], block[2], block[3]] + block_bbox = block[:4] if calculate_overlap_area_in_bbox1_area_ratio(block_bbox, layout_bbox) > 0.8: layout_blocks.append(block) # 如果layout_blocks不为空,则放入new_blocks中 if len(layout_blocks) > 0: new_blocks.append(layout_blocks) - # 从spans删除已经放入layout_sapns中的span + # 从all_bboxes删除已经放入layout_blocks中的block for layout_block in layout_blocks: all_bboxes.remove(layout_block)