Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(pdf_parse): remove tilted lines for better text extraction #1580

Merged
merged 1 commit into from
Jan 20, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions magic_pdf/pdf_parse_union_core_v2.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import copy
import math
import os
import re
import statistics
Expand Down Expand Up @@ -173,6 +174,21 @@ def calculate_char_in_span(char_bbox, span_bbox, char, span_height_radio=0.33):
return False


def remove_tilted_line(text_blocks):
    """Remove, in place, every line that is neither horizontal nor vertical.

    Each block is expected to be a dict whose ``'lines'`` entry is a list of
    line dicts, and each line dict carries a ``'dir'`` pair ``(cosine, sine)``
    describing its writing direction. A line is considered tilted when its
    direction deviates by more than 2 degrees from the nearest multiple of
    90 degrees (0, +/-90 or 180); such lines are dropped from the block.

    Args:
        text_blocks: iterable of block dicts. Blocks with no ``'lines'``
            entry (or an empty one) are left untouched.

    Returns:
        None. ``block['lines']`` is filtered in place, so the same list
        object stays attached to the block.
    """
    for block in text_blocks:
        lines = block.get('lines')
        if not lines:
            continue

        kept_lines = []
        for line in lines:
            cosine, sine = line['dir']
            # atan2 yields an angle in (-180, 180] degrees.
            angle_degrees = math.degrees(math.atan2(sine, cosine))
            # Fold onto [0, 90) so obtuse angles (e.g. 135 degrees) are
            # measured against the nearest axis instead of slipping past
            # a first-quadrant-only check.
            deviation = abs(angle_degrees) % 90
            if not 2 < deviation < 88:
                kept_lines.append(line)

        # Slice assignment keeps list identity and avoids the quadratic,
        # equality-based behaviour of repeated list.remove() calls.
        lines[:] = kept_lines


def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang):
# cid用0xfffd表示,连字符拆开
# text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXT_PRESERVE_WHITESPACE | fitz.TEXT_MEDIABOX_CLIP)['blocks']
Expand All @@ -183,6 +199,10 @@ def txt_spans_extract_v2(pdf_page, spans, all_bboxes, all_discarded_blocks, lang
# 自定义flags出现较多0xfffd,可能是pymupdf可以自行处理内置字典的pdf,不再使用
text_blocks_raw = pdf_page.get_text('rawdict', flags=fitz.TEXTFLAGS_TEXT)['blocks']
# text_blocks = pdf_page.get_text('dict', flags=fitz.TEXTFLAGS_TEXT)['blocks']

# 移除所有角度不为0或90的line
remove_tilted_line(text_blocks_raw)

all_pymu_chars = []
for block in text_blocks_raw:
for line in block['lines']:
Expand Down
Loading