def remove_tilted_line(text_blocks, max_skew_degrees=2):
    """Remove, in place, every line that is neither horizontal nor vertical.

    Each line's ``'dir'`` entry is unpacked as a ``(cosine, sine)`` pair and
    converted to an angle in degrees. A line is kept only when that angle is
    within ``max_skew_degrees`` of one of the axis directions (0, 90, 180 or
    270 degrees); every other line is dropped from its block.

    Args:
        text_blocks: Iterable of block dicts. Blocks that carry a ``'lines'``
            list are filtered; blocks without one are left untouched
            (presumably non-text blocks such as images - confirm against the
            producer of these dicts).
        max_skew_degrees: Largest deviation from an axis direction, in
            degrees, that still counts as horizontal/vertical. A deviation
            exactly equal to this value is kept.

    Returns:
        None. Each block's ``'lines'`` list is mutated in place, so existing
        references to that list observe the filtered content.
    """
    for block in text_blocks:
        lines = block.get('lines')
        if not lines:
            # Nothing to filter: missing key or an already-empty list.
            continue

        kept_lines = []
        for line in lines:
            cosine, sine = line['dir']
            # atan2 yields an angle in (-180, 180] degrees.
            angle_degrees = math.degrees(math.atan2(sine, cosine))
            # Fold the angle onto [0, 90) and measure the distance to the
            # nearest axis, so tilts in every quadrant (e.g. 135 or -120
            # degrees) are detected, not only those between 0 and 90.
            offset = abs(angle_degrees) % 90
            skew = min(offset, 90 - offset)
            if skew <= max_skew_degrees:
                kept_lines.append(line)

        # Slice assignment keeps the same list object while discarding the
        # tilted lines in one pass (no repeated list.remove scans).
        lines[:] = kept_lines
block['lines']: